In [512]:
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
from src.utility.logger import append_log
from sklearn.model_selection import TimeSeriesSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer

from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [513]:
# Load your dataset
df = pd.read_csv('data/raw/combined_iasi_no2_meteo_2020_2025_local.csv')

# Check for exact duplicates across all columns
exact_duplicates = df.duplicated().sum()
print(f"Exact duplicates found: {exact_duplicates}")

# View duplicate rows
duplicate_rows = df[df.duplicated(keep=False)]
print(f"Total rows involved in duplication: {len(duplicate_rows)}")

# Check duplicates on specific key columns
key_duplicates = df.duplicated(subset=['datetime']).sum()
print(f"Duplicates based on time: {key_duplicates}")

Exact duplicates found: 0
Total rows involved in duplication: 0
Duplicates based on time: 0


In [514]:
# Remove exact duplicates (keep first occurrence)
df_cleaned = df.drop_duplicates(keep='first')

# For key-based duplicates, investigate first
duplicate_keys = df[df.duplicated(subset=['datetime'], keep=False)]
print("Duplicate key patterns:")
print(duplicate_keys.groupby(['datetime']).size().sort_values(ascending=False))

# Remove duplicates based on key columns after investigation
df_cleaned = df.drop_duplicates(subset=['datetime'], keep='first')

Duplicate key patterns:
Series([], dtype: int64)


In [515]:
append_log(
    "outputs/logs/data_cleaning_log.txt",
    [
        f"Original dataset: {len(df)} rows",
        f"Exact duplicates removed: {exact_duplicates}",
        f"Key-based duplicates removed: {key_duplicates}",
        f"Final dataset: {len(df_cleaned)} rows"
    ]
)

In [516]:
df.describe()

Unnamed: 0,location_id,sensors_id,lat,lon,value,temp_C,dewpoint_C,slp_hPa,wind_dir_deg,wind_speed_ms,precip_mm
count,22081.0,22081.0,22081.0,22081.0,22081.0,48674.0,48674.0,48674.0,48253.0,6.0,48674.0
mean,9369.0,28602.0,47.1568,27.574886,31.661637,119.3424,58.548198,10085.463286,31.269538,-99.0,165.729178
std,0.0,0.0,3.5e-05,2.1e-05,30.498318,97.136655,100.504995,1349.646573,101.462163,0.0,364.906361
min,9369.0,28602.0,47.156766,27.574866,-1.0,-168.0,-9999.0,-9999.0,1.0,-99.0,0.0
25%,9369.0,28602.0,47.156766,27.574866,15.896116,37.0,0.0,10122.0,10.0,-99.0,5.0
50%,9369.0,28602.0,47.156766,27.574866,29.376155,116.0,58.0,10169.0,26.0,-99.0,8.0
75%,9369.0,28602.0,47.156836,27.574908,43.917378,194.0,124.0,10226.0,30.0,-99.0,8.0
max,9369.0,28602.0,47.156836,27.574908,2217.676463,391.0,236.0,10474.0,999.0,-99.0,999.0


In [517]:
df.head

<bound method NDFrame.head of        location_id  sensors_id   location                   datetime  \
0              NaN         NaN        NaN  2020-01-01 02:00:00+02:00   
1              NaN         NaN        NaN  2020-01-01 03:00:00+02:00   
2              NaN         NaN        NaN  2020-01-01 04:00:00+02:00   
3              NaN         NaN        NaN  2020-01-01 05:00:00+02:00   
4              NaN         NaN        NaN  2020-01-01 06:00:00+02:00   
...            ...         ...        ...                        ...   
49213       9369.0     28602.0  IS-1-9369  2025-08-20 20:00:00+03:00   
49214       9369.0     28602.0  IS-1-9369  2025-08-20 21:00:00+03:00   
49215       9369.0     28602.0  IS-1-9369  2025-08-20 22:00:00+03:00   
49216       9369.0     28602.0  IS-1-9369  2025-08-20 23:00:00+03:00   
49217       9369.0     28602.0  IS-1-9369  2025-08-21 00:00:00+03:00   

             lat        lon parameter  units      value  temp_C  dewpoint_C  \
0            NaN        Na

| Column       | Description                                                                 | Example value              |
|--------------|-----------------------------------------------------------------------------|----------------------------|
| location_id  | Unique numeric identifier of the monitoring location                        | 9369                       |
| sensors_id   | Unique numeric identifier of the sensor within the location                 | 28602                      |
| location     | Station code (often country code + site code + location ID)                 | RO0083A-9369               |
| datetime     | Timestamp of measurement (ISO 8601 with timezone)                          | 2020-08-04T01:00:00+03:00  |
| lat          | Latitude coordinate of the monitoring location                             | 47.1567664986992           |
| lon          | Longitude coordinate of the monitoring location                            | 27.5748656243897           |
| parameter    | Pollutant measured (e.g., `no2`, `pm10`, `pm25`, `o3`, etc.)               | no2                        |
| units        | Units of measurement (varies by parameter)                                 | µg/m³                      |
| value        | Recorded measurement value of the pollutant                                | 51.44521273                |


| Column (raw)       | Description                                                                 | Units (raw)       | Missing code |
|--------------------|-----------------------------------------------------------------------------|------------------|--------------|
| year               | Year (4-digit)                                                             | YYYY             | –            |
| month              | Month (2-digit)                                                            | MM               | –            |
| day                | Day of month (2-digit)                                                     | DD               | –            |
| hour               | Hour of day (UTC, 0–23)                                                     | HH               | –            |
| air temperature    | Air temperature in **tenths of °C**                                         | 0.1 °C           | -9999        |
| dew point temp     | Dew point temperature in **tenths of °C**                                   | 0.1 °C           | -9999        |
| sea level pressure | Sea level pressure in **tenths of hPa**                                     | 0.1 hPa          | -9999        |
| wind direction     | Wind direction from true north (0–360)                                      | degrees          | -999         |
| wind speed         | Wind speed in **tenths of m/s**                                             | 0.1 m/s          | -9999        |
| sky cover          | Cloud cover indicator (coded, e.g., oktas or station code dependent)        | categorical/code | -9999        |
| precipitation      | Precipitation depth during the past hour                                    | mm               | -9999        |

In [518]:
def identify_quality_issues(df):
    """Comprehensive data quality assessment"""
    issues = {}
    
    # Check for impossible values (domain-specific)
    if 'temp_C' in df.columns:
        impossible_temps = df[(df['temp_C'] < -500) | (df['temp_C'] > 700)]
        issues['impossible_temperatures'] = len(impossible_temps)
    
    if 'value' in df.columns:
        impossible_no2 = df[(df['value'] < 0 | (df['value'] > 300))]
        issues['impossible_no2'] = len(impossible_no2)
    
    # Check for future dates
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
        future_dates = df[df['timestamp'] > datetime.now()]
        issues['future_dates'] = len(future_dates)
    
    # Check for format inconsistencies
    for col in df.select_dtypes(include=['object']).columns:
        unique_patterns = df[col].astype(str).str.len().value_counts()
        if len(unique_patterns) > 10:  # Many different lengths suggest format issues
            issues[f'{col}_format_inconsistency'] = len(unique_patterns)
    
    return issues

# Run quality assessment
quality_report = identify_quality_issues(df_cleaned)
print("Data Quality Issues Found:")
for issue, count in quality_report.items():
    if count > 0:
        print(f"  {issue}: {count} records")

Data Quality Issues Found:
  impossible_no2: 3930 records


In [519]:
df_cleaned.replace([-9999, -999, -99, 9999, 999], np.nan, inplace=True)
df_cleaned.replace({'value': -1}, np.nan, inplace=True)
df_cleaned.describe()

Unnamed: 0,location_id,sensors_id,lat,lon,value,temp_C,dewpoint_C,slp_hPa,wind_dir_deg,wind_speed_ms,precip_mm
count,22081.0,22081.0,22081.0,22081.0,18151.0,48669.0,48640.0,48443.0,47734.0,0.0,40842.0
mean,9369.0,28602.0,47.1568,27.574886,38.733436,119.364832,59.065399,10175.869269,20.747643,,5.938348
std,0.0,0.0,3.5e-05,2.1e-05,29.16417,97.116429,77.021606,82.033924,10.640461,,2.411636
min,9369.0,28602.0,47.156766,27.574866,0.773662,-168.0,-187.0,9881.0,1.0,,0.0
25%,9369.0,28602.0,47.156766,27.574866,23.577705,37.0,0.0,10123.0,10.0,,4.0
50%,9369.0,28602.0,47.156766,27.574866,33.887936,116.0,58.0,10169.0,26.0,,7.0
75%,9369.0,28602.0,47.156836,27.574908,47.735446,194.0,124.0,10227.0,30.0,,8.0
max,9369.0,28602.0,47.156836,27.574908,2217.676463,391.0,236.0,10474.0,36.0,,9.0


In [520]:
append_log(
    "outputs/logs/data_cleaning_log.txt",
    [
        f"Replaced values with missing code with nan [-9999, -999, -99, 9999, 999]",
    ]
)

In [521]:
print(df.dtypes)

location_id      float64
sensors_id       float64
location          object
datetime          object
lat              float64
lon              float64
parameter         object
units             object
value            float64
temp_C           float64
dewpoint_C       float64
slp_hPa          float64
wind_dir_deg     float64
wind_speed_ms    float64
sky_cover         object
precip_mm        float64
dtype: object


In [522]:
# Confirm that date time format is consistent
print(df_cleaned["datetime"].dtype)

# Coerce to datetime with timezone awareness
df_cleaned["datetime"] = pd.to_datetime(df_cleaned["datetime"], errors="coerce", utc=True)
# Convert from UTC to Iași local time (Europe/Bucharest)
df_cleaned["datetime"] = df_cleaned["datetime"].dt.tz_convert("Europe/Bucharest")

print(df_cleaned["datetime"].dtype)      # should show: datetime64[ns, Europe/Bucharest]
print(df_cleaned["datetime"].head(3))    # should print like: 2025-08-20 20:00:00+03:00

object
datetime64[ns, Europe/Bucharest]
0   2020-01-01 02:00:00+02:00
1   2020-01-01 03:00:00+02:00
2   2020-01-01 04:00:00+02:00
Name: datetime, dtype: datetime64[ns, Europe/Bucharest]


In [523]:
append_log(
    "outputs/logs/data_cleaning_log.txt",
    [
        f"Original datetime type: {df["datetime"].dtype}",
        f"Final type: {df_cleaned["datetime"].dtype}",
    ]
)

In [524]:
conversion_dict = {
    'location_id': 'int64',
    'sensors_id': 'int64',
    'location': 'category',
    'lat': 'float64',
    'lon': 'float64',
    'parameter': 'category',
    'units': 'category',
    'value': 'float64',
    'temp_C': 'float64',
    'dewpoint_C': 'float64',
    'slp_hPa': 'float64',
    'wind_dir_deg': 'float64',
    'wind_speed_ms': 'float64',
    'sky_cover': 'category',
    'precip_mm': 'float64'
}

for column, dtype in conversion_dict.items():
    if column in df_cleaned.columns:
        df_cleaned[column] = df_cleaned[column].astype(dtype, errors='ignore')

In [525]:
print(df_cleaned.dtypes)
print(df_cleaned[20000:20005])

location_id                               float64
sensors_id                                float64
location                                 category
datetime         datetime64[ns, Europe/Bucharest]
lat                                       float64
lon                                       float64
parameter                                category
units                                    category
value                                     float64
temp_C                                    float64
dewpoint_C                                float64
slp_hPa                                   float64
wind_dir_deg                              float64
wind_speed_ms                             float64
sky_cover                                category
precip_mm                                 float64
dtype: object
       location_id  sensors_id      location                  datetime  \
20000       9369.0     28602.0  RO0083A-9369 2022-04-15 01:00:00+03:00   
20001       9369.0     28602.0  RO0083

In [526]:
append_log(
    "outputs/logs/data_cleaning_log.txt",
    [
        f"Original data typs: {df.dtypes}"
        f"Final data types: {df_cleaned.dtypes}"
    ]
)

In [527]:
# Extract year
df_cleaned["year"] = df_cleaned["datetime"].dt.year

# Count rows with missing air quality data per year
missing_counts = (
    df_cleaned[df_cleaned["location_id"].isna()]
    .groupby("year")
    .size()
)

present_counts = (
    df_cleaned[df_cleaned["location_id"].notna()]
    .groupby("year")
    .size()
)

print("Rows with missing air quality entries per year:")
print(missing_counts)

print("Rows with present air quality entries per year:")
print(present_counts)

Rows with missing air quality entries per year:
year
2020    5893
2021    2082
2022    6693
2023    8684
2024    3286
2025     499
dtype: int64
Rows with present air quality entries per year:
year
2020    2879
2021    6651
2022    2024
2024    5466
2025    5061
dtype: int64


In [528]:
df_cleaned = df_cleaned.drop(columns=["location_id", "sensors_id", "location", "lat", "lon", "parameter", "units", "wind_speed_ms"])
print(df_cleaned.describe())

              value        temp_C    dewpoint_C       slp_hPa  wind_dir_deg  \
count  18151.000000  48669.000000  48640.000000  48443.000000  47734.000000   
mean      38.733436    119.364832     59.065399  10175.869269     20.747643   
std       29.164170     97.116429     77.021606     82.033924     10.640461   
min        0.773662   -168.000000   -187.000000   9881.000000      1.000000   
25%       23.577705     37.000000      0.000000  10123.000000     10.000000   
50%       33.887936    116.000000     58.000000  10169.000000     26.000000   
75%       47.735446    194.000000    124.000000  10227.000000     30.000000   
max     2217.676463    391.000000    236.000000  10474.000000     36.000000   

          precip_mm          year  
count  40842.000000  49218.000000  
mean       5.938348   2022.337092  
std        2.411636      1.637435  
min        0.000000   2020.000000  
25%        4.000000   2021.000000  
50%        7.000000   2022.000000  
75%        8.000000   2024.000000  


In [529]:
append_log(
    "outputs/logs/data_cleaning_log.txt",
    [
        f"Dropped columns that don't have variation location_id, sensors_id, location, lat, lon, parameter, units, wind_speed_ms]"
    ]
)

Standard random train/test splitting is not appropriate for time-series data because it assumes that all observations are independent and identically distributed. In reality, air quality and meteorological data are sequential, and each observation is influenced by temporal patterns such as seasonality, daily cycles, or longer-term trends. If the data were shuffled randomly, the model could inadvertently use information from the future to predict the past, creating data leakage and artificially inflating performance metrics.

To address this, the project uses TimeSeriesSplit, which preserves chronological order when creating training and testing datasets. In this approach, the model is always trained on earlier periods and tested on later ones, reflecting the real-world task of forecasting future air quality based on past conditions. Instead of relying on a single cut between train and test sets, TimeSeriesSplit generates multiple rolling splits. This makes it possible to evaluate model performance across different time periods, providing a more robust sense of how well the model generalizes.

This method is particularly valuable for air quality forecasting in Iași, where non-stationary effects such as heating in winter or traffic intensity during specific months can significantly influence pollutant concentrations. By using a time-aware validation strategy, the project ensures that the evaluation reflects these real variations. Ultimately, this leads to more realistic and trustworthy forecasts, which are crucial if the model is to support sustainable urban planning and public health decisions.

In [530]:
def clean_target(X: pd.DataFrame, y: pd.Series, name="y"):
    """
    Drop rows where the target is NaN; keep X/y aligned and report how many dropped.
    Ensures y is numeric for f_regression.
    """
    # ensure numeric target (coerce weird strings to NaN)
    y_num = pd.to_numeric(y, errors="coerce")

    mask = y_num.notna()
    dropped = (~mask).sum()

    if dropped:
        print(f"[clean_target] Dropping {dropped} rows with NaN in {name}")

    X_clean = X.loc[mask]
    y_clean = y_num.loc[mask]

    return X_clean, y_clean

In [531]:
# Create time series splits for cross-validation*
X = df_cleaned.drop(columns=["value"])  # features
y = df_cleaned["value"]
tscv = TimeSeriesSplit(n_splits=2)
folds = {}
for i, (train_idx, test_idx) in enumerate(tscv.split(X), start=1):
    print(f"Fold {i}:")
    print(f"  Train: {train_idx[0]} to {train_idx[-1]}")
    print(f"  Test:  {test_idx[0]} to {test_idx[-1]}")
    folds[f"fold_{i}"] = {
        "X_train": X.iloc[train_idx],
        "X_test": X.iloc[test_idx],
        "y_train": y.iloc[train_idx],
        "y_test": y.iloc[test_idx],
    }

def drop_nan_target_in_folds(folds: dict, target_name="y"):
    for fold, d in folds.items():
        Xtr, ytr = d["X_train"], d["y_train"]
        Xte, yte = d["X_test"],  d["y_test"]

        Xtr, ytr = clean_target(Xtr, ytr, name=f"{fold}_y_train")
        Xte, yte = clean_target(Xte, yte, name=f"{fold}_y_test")

        d["X_train"], d["y_train"] = Xtr, ytr
        d["X_test"],  d["y_test"]  = Xte, yte
    return folds

folds = drop_nan_target_in_folds(folds, target_name="value")

Fold 1:
  Train: 0 to 16405
  Test:  16406 to 32811
Fold 2:
  Train: 0 to 32811
  Test:  32812 to 49217
[clean_target] Dropping 7911 rows with NaN in fold_1_y_train
[clean_target] Dropping 13347 rows with NaN in fold_1_y_test
[clean_target] Dropping 21258 rows with NaN in fold_2_y_train
[clean_target] Dropping 9809 rows with NaN in fold_2_y_test


In [532]:
append_log(
    "outputs/logs/data_cleaning_log.txt",
    [
        f"Temporal split using TimeSeriesSplit with n_splits=5",
        f"Dropping rows with nan values in target"
    ]
)

In [533]:

# --- datetime feature extractor (unchanged) ---
def make_dt_block(datetime_col="datetime"):
    def _extract(X: pd.DataFrame):
        dt = pd.to_datetime(X[datetime_col])
        return pd.DataFrame({
            "dt__hour": dt.dt.hour,
            "dt__dow": dt.dt.dayofweek,
            "dt__month": dt.dt.month,
            "dt__is_weekend": (dt.dt.dayofweek >= 5).astype(int),
        }, index=X.index)

    def _feature_names(transformer, input_features=None):
        # must match the columns returned by _extract
        return np.array(["dt__hour", "dt__dow", "dt__month", "dt__is_weekend"])

    return FunctionTransformer(
        _extract,
        feature_names_out=_feature_names
    )

# --- clip target values at 300 (handles sensor malfunctions) ---
def clip_target(y, min_val=0, max_val=300):
    """
    Clip target values to a specified range.
    """
    return y.clip(lower=min_val, upper=max_val)

# CORRECT: Learn all parameters from training data only
def create_preprocessing_pipeline(X_train, y_train, k=20):
    """Create preprocessing pipeline fitted on training data"""

    # Ensure numeric y and drop rows with NaN target (required by f_regression)
    y_train = pd.to_numeric(y_train, errors="coerce")
    y_train = clip_target(y_train, max_val=300)
    mask = y_train.notna()
    if not mask.any():
        raise ValueError("No non-NaN targets in y_train.")
    X_train = X_train.loc[mask]
    y_train = y_train.loc[mask]

    # Identify numeric vs categorical columns (excluding datetime)
    numeric_features = X_train.select_dtypes(include=["int64", "float64"]).columns
    categorical_features = X_train.select_dtypes(include=["object", "category", "string"]).columns

    # Numeric pipeline: impute with median, then scale
    numeric_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    # Categorical pipeline: impute with most_frequent, then one-hot encode
    categorical_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    # Datetime pipeline: extract (hour/dow/month/is_weekend) then scale
    dt_pipeline = Pipeline(steps=[
        ("dt_extract", make_dt_block("datetime")),
        ("scaler", StandardScaler())
    ])

    # Combine
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_pipeline, numeric_features),
            ("cat", categorical_pipeline, categorical_features),
            ("dt",  dt_pipeline, ["datetime"])
        ]
    )

    # Add feature selection at the end
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("selector", SelectKBest(score_func=f_regression, k=k))
    ])

    # Fit pipeline on training data
    X_train_processed = pipeline.fit_transform(X_train, y_train)

    return X_train_processed, pipeline

def apply_preprocessing_pipeline(X_test, pipeline):
    """
    Apply the fitted preprocessing pipeline to test data.
    `pipeline` is the object returned by create_preprocessing_pipeline(...).
    """
    return pipeline.transform(X_test)

for fold_name, data in folds.items():
        X_train, y_train = data["X_train"], data["y_train"]
        X_test, y_test   = data["X_test"], data["y_test"]

        # Fit preprocessing on training
        X_train_proc, preprocessors = create_preprocessing_pipeline(X_train, y_train)
        # Apply to test
        X_test_proc = apply_preprocessing_pipeline(X_test, preprocessors)

        # Store results back into dict
        data["X_train_proc"] = X_train_proc
        data["X_test_proc"]  = X_test_proc
        data["y_train"] = clip_target(pd.to_numeric(data["y_train"], errors="coerce"), max_val=300)
        data["y_test"]  = clip_target(pd.to_numeric(data["y_test"],  errors="coerce"), max_val=300)
        data["preprocessors"] = preprocessors



In [534]:
append_log(
    "outputs/logs/data_cleaning_log.txt",
    [
        f"Preprocessed datasets using preprocessing pipeline, for categorical and numerical features",
    ]
)

In [535]:
def get_feature_names(pipeline):
    try:
        # Features after preprocessing (numeric + one-hot expanded)
        names = pipeline.named_steps["preprocessor"].get_feature_names_out()

        # Apply SelectKBest mask if present
        if "selector" in pipeline.named_steps:
            support = pipeline.named_steps["selector"].get_support()
            names = names[support]
        return list(names)
    except Exception:
        return None

out_dir = Path("data/processed")
out_dir.mkdir(parents=True, exist_ok=True)

for fold_name, data in folds.items():
    pipeline = data.get("preprocessors") or data.get("pipeline")
    feature_names = get_feature_names(pipeline)

    # Wrap arrays with headers if available
    X_train_df = pd.DataFrame(data["X_train_proc"], columns=feature_names)
    X_test_df  = pd.DataFrame(data["X_test_proc"],  columns=feature_names)

    # Targets as Series with header
    y_train_s  = pd.Series(data["y_train"]).reset_index(drop=True)
    y_test_s   = pd.Series(data["y_test"]).reset_index(drop=True)
    y_train_s.name = "target"
    y_test_s.name  = "target"

    # Save
    X_train_df.to_csv(out_dir / f"{fold_name}_X_train.csv", index=False)
    X_test_df.to_csv(out_dir / f"{fold_name}_X_test.csv", index=False)
    y_train_s.to_csv(out_dir / f"{fold_name}_y_train.csv", index=False, header=True)
    y_test_s.to_csv(out_dir / f"{fold_name}_y_test.csv", index=False, header=True)

    print(f"Saved {fold_name} with headers to {out_dir}")

Saved fold_1 with headers to data/processed
Saved fold_2 with headers to data/processed
