In [70]:
import pandas as pd
import numpy as np
from datetime import datetime
from src.utility.logger import append_log
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt

In [71]:
# Load your dataset
df = pd.read_csv('data/raw/combined_iasi_no2_meteo_2020_2025_local.csv')

# Check for exact duplicates across all columns
exact_duplicates = df.duplicated().sum()
print(f"Exact duplicates found: {exact_duplicates}")

# View duplicate rows
duplicate_rows = df[df.duplicated(keep=False)]
print(f"Total rows involved in duplication: {len(duplicate_rows)}")

# Check duplicates on specific key columns
key_duplicates = df.duplicated(subset=['datetime']).sum()
print(f"Duplicates based on time: {key_duplicates}")

Exact duplicates found: 0
Total rows involved in duplication: 0
Duplicates based on time: 0


In [72]:
# Remove exact duplicates (keep first occurrence)
df_cleaned = df.drop_duplicates(keep='first')

# For key-based duplicates, investigate first
duplicate_keys = df[df.duplicated(subset=['datetime'], keep=False)]
print("Duplicate key patterns:")
print(duplicate_keys.groupby(['datetime']).size().sort_values(ascending=False))

# Remove duplicates based on key columns after investigation
df_cleaned = df.drop_duplicates(subset=['datetime'], keep='first')

Duplicate key patterns:
Series([], dtype: int64)


In [73]:
append_log(
    "outputs/logs/data_cleaning_log.txt",
    [
        f"Original dataset: {len(df)} rows",
        f"Exact duplicates removed: {exact_duplicates}",
        f"Key-based duplicates removed: {key_duplicates}",
        f"Final dataset: {len(df_cleaned)} rows"
    ]
)

In [74]:
df.describe()

Unnamed: 0,location_id,sensors_id,lat,lon,value,temp_C,dewpoint_C,slp_hPa,wind_dir_deg,wind_speed_ms,precip_mm
count,22081.0,22081.0,22081.0,22081.0,22081.0,48674.0,48674.0,48674.0,48253.0,6.0,48674.0
mean,9369.0,28602.0,47.1568,27.574886,31.661637,119.3424,58.548198,10085.463286,31.269538,-99.0,165.729178
std,0.0,0.0,3.5e-05,2.1e-05,30.498318,97.136655,100.504995,1349.646573,101.462163,0.0,364.906361
min,9369.0,28602.0,47.156766,27.574866,-1.0,-168.0,-9999.0,-9999.0,1.0,-99.0,0.0
25%,9369.0,28602.0,47.156766,27.574866,15.896116,37.0,0.0,10122.0,10.0,-99.0,5.0
50%,9369.0,28602.0,47.156766,27.574866,29.376155,116.0,58.0,10169.0,26.0,-99.0,8.0
75%,9369.0,28602.0,47.156836,27.574908,43.917378,194.0,124.0,10226.0,30.0,-99.0,8.0
max,9369.0,28602.0,47.156836,27.574908,2217.676463,391.0,236.0,10474.0,999.0,-99.0,999.0


In [75]:
df.head

<bound method NDFrame.head of        location_id  sensors_id   location                   datetime  \
0              NaN         NaN        NaN  2020-01-01 02:00:00+02:00   
1              NaN         NaN        NaN  2020-01-01 03:00:00+02:00   
2              NaN         NaN        NaN  2020-01-01 04:00:00+02:00   
3              NaN         NaN        NaN  2020-01-01 05:00:00+02:00   
4              NaN         NaN        NaN  2020-01-01 06:00:00+02:00   
...            ...         ...        ...                        ...   
49213       9369.0     28602.0  IS-1-9369  2025-08-20 20:00:00+03:00   
49214       9369.0     28602.0  IS-1-9369  2025-08-20 21:00:00+03:00   
49215       9369.0     28602.0  IS-1-9369  2025-08-20 22:00:00+03:00   
49216       9369.0     28602.0  IS-1-9369  2025-08-20 23:00:00+03:00   
49217       9369.0     28602.0  IS-1-9369  2025-08-21 00:00:00+03:00   

             lat        lon parameter  units      value  temp_C  dewpoint_C  \
0            NaN        Na

| Column       | Description                                                                 | Example value              |
|--------------|-----------------------------------------------------------------------------|----------------------------|
| location_id  | Unique numeric identifier of the monitoring location                        | 9369                       |
| sensors_id   | Unique numeric identifier of the sensor within the location                 | 28602                      |
| location     | Station code (often country code + site code + location ID)                 | RO0083A-9369               |
| datetime     | Timestamp of measurement (ISO 8601 with timezone)                          | 2020-08-04T01:00:00+03:00  |
| lat          | Latitude coordinate of the monitoring location                             | 47.1567664986992           |
| lon          | Longitude coordinate of the monitoring location                            | 27.5748656243897           |
| parameter    | Pollutant measured (e.g., `no2`, `pm10`, `pm25`, `o3`, etc.)               | no2                        |
| units        | Units of measurement (varies by parameter)                                 | µg/m³                      |
| value        | Recorded measurement value of the pollutant                                | 51.44521273                |


| Column (raw)       | Description                                                                 | Units (raw)       | Missing code |
|--------------------|-----------------------------------------------------------------------------|------------------|--------------|
| year               | Year (4-digit)                                                             | YYYY             | –            |
| month              | Month (2-digit)                                                            | MM               | –            |
| day                | Day of month (2-digit)                                                     | DD               | –            |
| hour               | Hour of day (UTC, 0–23)                                                     | HH               | –            |
| air temperature    | Air temperature in **tenths of °C**                                         | 0.1 °C           | -9999        |
| dew point temp     | Dew point temperature in **tenths of °C**                                   | 0.1 °C           | -9999        |
| sea level pressure | Sea level pressure in **tenths of hPa**                                     | 0.1 hPa          | -9999        |
| wind direction     | Wind direction from true north (0–360)                                      | degrees          | -999         |
| wind speed         | Wind speed in **tenths of m/s**                                             | 0.1 m/s          | -9999        |
| sky cover          | Cloud cover indicator (coded, e.g., oktas or station code dependent)        | categorical/code | -9999        |
| precipitation      | Precipitation depth during the past hour                                    | mm               | -9999        |

In [76]:
def identify_quality_issues(df):
    """Comprehensive data quality assessment"""
    issues = {}
    
    # Check for impossible values (domain-specific)
    if 'temp_C' in df.columns:
        impossible_temps = df[(df['temp_C'] < -500) | (df['temp_C'] > 700)]
        issues['impossible_temperatures'] = len(impossible_temps)
    
    if 'value' in df.columns:
        impossible_no2 = df[(df['value'] < 0 | (df['value'] > 300))]
        issues['impossible_no2'] = len(impossible_no2)
    
    # Check for future dates
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
        future_dates = df[df['timestamp'] > datetime.now()]
        issues['future_dates'] = len(future_dates)
    
    # Check for format inconsistencies
    for col in df.select_dtypes(include=['object']).columns:
        unique_patterns = df[col].astype(str).str.len().value_counts()
        if len(unique_patterns) > 10:  # Many different lengths suggest format issues
            issues[f'{col}_format_inconsistency'] = len(unique_patterns)
    
    return issues

# Run quality assessment
quality_report = identify_quality_issues(df_cleaned)
print("Data Quality Issues Found:")
for issue, count in quality_report.items():
    if count > 0:
        print(f"  {issue}: {count} records")

Data Quality Issues Found:
  impossible_no2: 3930 records


In [77]:
df_cleaned.replace([-9999, -999, -99, 9999, 999], np.nan, inplace=True)
df_cleaned.describe()

Unnamed: 0,location_id,sensors_id,lat,lon,value,temp_C,dewpoint_C,slp_hPa,wind_dir_deg,wind_speed_ms,precip_mm
count,22081.0,22081.0,22081.0,22081.0,22081.0,48669.0,48640.0,48443.0,47734.0,0.0,40842.0
mean,9369.0,28602.0,47.1568,27.574886,31.661637,119.364832,59.065399,10175.869269,20.747643,,5.938348
std,0.0,0.0,3.5e-05,2.1e-05,30.498318,97.116429,77.021606,82.033924,10.640461,,2.411636
min,9369.0,28602.0,47.156766,27.574866,-1.0,-168.0,-187.0,9881.0,1.0,,0.0
25%,9369.0,28602.0,47.156766,27.574866,15.896116,37.0,0.0,10123.0,10.0,,4.0
50%,9369.0,28602.0,47.156766,27.574866,29.376155,116.0,58.0,10169.0,26.0,,7.0
75%,9369.0,28602.0,47.156836,27.574908,43.917378,194.0,124.0,10227.0,30.0,,8.0
max,9369.0,28602.0,47.156836,27.574908,2217.676463,391.0,236.0,10474.0,36.0,,9.0


In [78]:
append_log(
    "outputs/logs/data_cleaning_log.txt",
    [
        f"Replaced values with missing code with nan [-9999, -999, -99, 9999, 999]",
    ]
)

In [79]:
print(df.dtypes)

location_id      float64
sensors_id       float64
location          object
datetime          object
lat              float64
lon              float64
parameter         object
units             object
value            float64
temp_C           float64
dewpoint_C       float64
slp_hPa          float64
wind_dir_deg     float64
wind_speed_ms    float64
sky_cover         object
precip_mm        float64
dtype: object


In [80]:
# Confirm that date time format is consistent
print(df_cleaned["datetime"].dtype)

# Coerce to datetime with timezone awareness
df_cleaned["datetime"] = pd.to_datetime(df_cleaned["datetime"], errors="coerce", utc=True)
# Convert from UTC to Iași local time (Europe/Bucharest)
df_cleaned["datetime"] = df_cleaned["datetime"].dt.tz_convert("Europe/Bucharest")

print(df_cleaned["datetime"].dtype)      # should show: datetime64[ns, Europe/Bucharest]
print(df_cleaned["datetime"].head(3))    # should print like: 2025-08-20 20:00:00+03:00

object
datetime64[ns, Europe/Bucharest]
0   2020-01-01 02:00:00+02:00
1   2020-01-01 03:00:00+02:00
2   2020-01-01 04:00:00+02:00
Name: datetime, dtype: datetime64[ns, Europe/Bucharest]


In [81]:
append_log(
    "outputs/logs/data_cleaning_log.txt",
    [
        f"Original datetime type: {df["datetime"].dtype}",
        f"Final type: {df_cleaned["datetime"].dtype}",
    ]
)

In [82]:
conversion_dict = {
    'location_id': 'int64',
    'sensors_id': 'int64',
    'location': 'category',
    'lat': 'float64',
    'lon': 'float64',
    'parameter': 'category',
    'units': 'category',
    'value': 'float64',
    'temp_C': 'float64',
    'dewpoint_C': 'float64',
    'slp_hPa': 'float64',
    'wind_dir_deg': 'float64',
    'wind_speed_ms': 'float64',
    'sky_cover': 'category',
    'precip_mm': 'float64'
}

for column, dtype in conversion_dict.items():
    if column in df_cleaned.columns:
        df_cleaned[column] = df_cleaned[column].astype(dtype, errors='ignore')

In [83]:
print(df_cleaned.dtypes)
print(df_cleaned[20000:20005])

location_id                               float64
sensors_id                                float64
location                                 category
datetime         datetime64[ns, Europe/Bucharest]
lat                                       float64
lon                                       float64
parameter                                category
units                                    category
value                                     float64
temp_C                                    float64
dewpoint_C                                float64
slp_hPa                                   float64
wind_dir_deg                              float64
wind_speed_ms                             float64
sky_cover                                category
precip_mm                                 float64
dtype: object
       location_id  sensors_id      location                  datetime  \
20000       9369.0     28602.0  RO0083A-9369 2022-04-15 01:00:00+03:00   
20001       9369.0     28602.0  RO0083

In [84]:
append_log(
    "outputs/logs/data_cleaning_log.txt",
    [
        f"Original data typs: {df.dtypes}"
        f"Final data types: {df_cleaned.dtypes}"
    ]
)

In [85]:
# Extract year
df_cleaned["year"] = df_cleaned["datetime"].dt.year

# Count rows with missing air quality data per year
missing_counts = (
    df_cleaned[df_cleaned["location_id"].isna()]
    .groupby("year")
    .size()
)

present_counts = (
    df_cleaned[df_cleaned["location_id"].notna()]
    .groupby("year")
    .size()
)

print("Rows with missing air quality entries per year:")
print(missing_counts)

print("Rows with present air quality entries per year:")
print(present_counts)

Rows with missing air quality entries per year:
year
2020    5893
2021    2082
2022    6693
2023    8684
2024    3286
2025     499
dtype: int64
Rows with present air quality entries per year:
year
2020    2879
2021    6651
2022    2024
2024    5466
2025    5061
dtype: int64


Standard random train/test splitting is not appropriate for time-series data because it assumes that all observations are independent and identically distributed. In reality, air quality and meteorological data are sequential, and each observation is influenced by temporal patterns such as seasonality, daily cycles, or longer-term trends. If the data were shuffled randomly, the model could inadvertently use information from the future to predict the past, creating data leakage and artificially inflating performance metrics.

To address this, the project uses TimeSeriesSplit, which preserves chronological order when creating training and testing datasets. In this approach, the model is always trained on earlier periods and tested on later ones, reflecting the real-world task of forecasting future air quality based on past conditions. Instead of relying on a single cut between train and test sets, TimeSeriesSplit generates multiple rolling splits. This makes it possible to evaluate model performance across different time periods, providing a more robust sense of how well the model generalizes.

This method is particularly valuable for air quality forecasting in Iași, where non-stationary effects such as heating in winter or traffic intensity during specific months can significantly influence pollutant concentrations. By using a time-aware validation strategy, the project ensures that the evaluation reflects these real variations. Ultimately, this leads to more realistic and trustworthy forecasts, which are crucial if the model is to support sustainable urban planning and public health decisions.

In [86]:
# Create time series splits for cross-validation*
tscv = TimeSeriesSplit(n_splits=5)
folds = {}
for i, (train_idx, test_idx) in enumerate(tscv.split(df_cleaned), start=1):
    print(f"Fold {i+1}:")
    print(f"  Train: {train_idx[0]} to {train_idx[-1]}")
    print(f"  Test:  {test_idx[0]} to {test_idx[-1]}")
    folds[f"fold_{i}"] = {
        "X_Train": df_cleaned.iloc[train_idx],
        "X_Test": df_cleaned.iloc[test_idx],
    }

Fold 2:
  Train: 0 to 8202
  Test:  8203 to 16405
Fold 3:
  Train: 0 to 16405
  Test:  16406 to 24608
Fold 4:
  Train: 0 to 24608
  Test:  24609 to 32811
Fold 5:
  Train: 0 to 32811
  Test:  32812 to 41014
Fold 6:
  Train: 0 to 41014
  Test:  41015 to 49217


In [87]:
append_log(
    "outputs/logs/data_cleaning_log.txt",
    [
        f"Temporal split using TimeSeriesSplit with n_splits=5",
    ]
)