# 1. Data Exploration and Analysis



## 2. Imports, Paths, Seed (plan)

**Note:** Make sure to activate the virtual environment before running this notebook:
```bash
source ../.venv/bin/activate
```



In [28]:
# Bootstrap imports from the project root so `from src...` works
import sys
from pathlib import Path
import importlib
import numpy as np

# Resolve the project root: when the kernel's working directory is named
# "notebooks", the root is its parent; any other working directory is taken
# to be the root itself.
if Path.cwd().name == "notebooks":
    ROOT = Path.cwd().parent
else:
    ROOT = Path.cwd()

# Prepend ROOT to the import search path (only once) so `from src...` resolves.
if sys.path.count(str(ROOT)) == 0:
    sys.path.insert(0, str(ROOT))

# Invalidate importlib's finder caches so the newly added path is picked up.
importlib.invalidate_caches()

import pandas as pd
from src.config import DATA_RAW, FIGURES_DIR, OUTPUTS, DATA_INTERIM
from src.utils import seed_all, get_logger
from src.data_loading import load_raw
from src.detect import detect_columns
from src.validate import parse_sort_assert, boundary_check, save_continuity_hist, snapshot
from src.eda import profile_missingness
from src.policies import count_missing_target, dry_run_feature_fill
from src.plots import hist_target, box_target, seasonal_mean
from src.seasonality import add_time_parts
from src.relations import candidate_exogenous, numeric_columns, corr_with_target, corr_matrix, scatter_xy, hexbin_xy
from src.temporal import plot_autocorr, seasonal_lag_strength, plot_pacf_optional
from src.preprocess_pipeline import build_interim
import json, pathlib

# Bind the logger first so later cells can rely on `log` even if one of the
# artifact steps below raises.
log = get_logger("eda")

# Load the cleaned interim table. On a fresh checkout the CSV may not exist
# yet, so build it the same way the preprocessing section of this notebook
# does (build_interim(..., save_csv=True)) before reading it back.
clean_csv = DATA_INTERIM / "clean.csv"
if not clean_csv.exists():
    log.info("%s not found; building it from the raw data.", clean_csv)
    raw_frames = load_raw()  # (train_raw, test_raw, sample_sub)
    build_interim(raw_frames[0], raw_frames[1], save_csv=True)
interim = pd.read_csv(clean_csv, parse_dates=["datetime"])

# Derive the feature list here instead of reading a `feature_cols` left over
# from a previous kernel session (that name is only defined much later in the
# notebook, so a Restart & Run All used to fail with NameError).
# Same exclusion rule as the "Feature list (contract)" section: drop the
# time column, the split marker and both spellings of the target.
NON_FEATURE_COLS = {"datetime", "split", "pm2.5", "pm2_5"}
feature_cols = [c for c in interim.columns if c not in NON_FEATURE_COLS]

# NOTE(review): this path is relative to the kernel's working directory, not
# to ROOT — confirm that is where downstream code expects the file.
feat_path = pathlib.Path("experiments/features.json")
feat_path.parent.mkdir(parents=True, exist_ok=True)
# Context manager guarantees the handle is flushed and closed.
with feat_path.open("w") as fh:
    json.dump(feature_cols, fh, indent=2)
print("Wrote", feat_path)

# Blocked CV folds over the train split: 3 folds, 30 validation days each.
from src.cv import make_blocked_folds, mask_for_range
folds = make_blocked_folds(interim.query("split=='train'"), "datetime", n_folds=3, val_days=30)
for fold in folds:
    print(f"{fold.name}: TRAIN {fold.train_start} → {fold.train_end} | VAL {fold.val_start} → {fold.val_end}")




Wrote experiments/features.json
fold1: TRAIN 2010-01-02 00:00:00 → 2013-06-03 02:00:00 | VAL 2013-06-03 03:00:00 → 2013-07-02 03:00:00
fold2: TRAIN 2010-01-02 00:00:00 → 2013-05-04 02:00:00 | VAL 2013-05-04 03:00:00 → 2013-06-02 03:00:00
fold3: TRAIN 2010-01-02 00:00:00 → 2013-04-04 02:00:00 | VAL 2013-04-04 03:00:00 → 2013-05-03 03:00:00


In [2]:
# Fetch the three raw frames from the project loader, then report each
# frame's shape and show its first three rows.
train_raw, test_raw, sample_sub = load_raw()
print("Shapes:", *(frame.shape for frame in (train_raw, test_raw, sample_sub)))
display(train_raw.head(3), test_raw.head(3), sample_sub.head(3))


Shapes: (30676, 12) (13148, 11) (13148, 2)


Unnamed: 0,No,DEWP,TEMP,PRES,Iws,Is,Ir,datetime,cbwd_NW,cbwd_SE,cbwd_cv,pm2.5
0,1,-1.580878,-1.92225,0.443328,-0.441894,-0.069353,-0.137667,2010-01-01 00:00:00,1.448138,-0.732019,-0.522096,
1,2,-1.580878,-2.004228,0.345943,-0.379306,-0.069353,-0.137667,2010-01-01 01:00:00,1.448138,-0.732019,-0.522096,
2,3,-1.580878,-1.92225,0.248559,-0.343514,-0.069353,-0.137667,2010-01-01 02:00:00,1.448138,-0.732019,-0.522096,


Unnamed: 0,No,DEWP,TEMP,PRES,Iws,Is,Ir,datetime,cbwd_NW,cbwd_SE,cbwd_cv
0,30677,1.190496,0.701029,-2.186052,-0.003982,-0.069353,-0.137667,2013-07-02 04:00:00,1.448138,-0.732019,-0.522096
1,30678,1.121211,0.619051,-2.186052,0.031811,-0.069353,-0.137667,2013-07-02 05:00:00,1.448138,-0.732019,-0.522096
2,30679,1.190496,0.783006,-2.186052,0.094398,-0.069353,-0.137667,2013-07-02 06:00:00,1.448138,-0.732019,-0.522096


Unnamed: 0,row ID,pm2.5
0,2013-07-02 4:00:00,14
1,2013-07-02 5:00:00,14
2,2013-07-02 6:00:00,14


In [3]:
# Detect the datetime and target column names in each raw frame.
# The test frame has no target column, so `test_y` comes back as None.
train_dt, train_y = detect_columns(train_raw)
test_dt, test_y = detect_columns(test_raw)

print(f"Detected datetime columns: train='{train_dt}', test='{test_dt}'")
print(f"Detected target columns: train='{train_y}', test='{test_y}'")

# Use the detected target when it is a real train column; otherwise fall back
# to the "pm2_5" alias. (A previous first assignment to TARGET was removed: it
# was unconditionally overwritten by this line and never took effect.)
TARGET = train_y if train_y in train_raw.columns else "pm2_5"
assert TARGET in train_raw.columns, f"Target column not found in train: {TARGET}"

print(f"Using target column: {TARGET}")

# Working names for the frames. These are plain references to the raw frames;
# no copy is made here.
train = train_raw
test = test_raw


Detected datetime columns: train='datetime', test='datetime'
Detected target columns: train='pm2.5', test='None'
Using target column: pm2.5


## 3. Load raw data (read-only)



In [4]:
# Call the project loader again and rebind train_raw / test_raw / sample_sub
# to its three return values.
train_raw, test_raw, sample_sub = load_raw()
# Report each frame's shape, then preview the first three rows of each.
print("Shapes:", train_raw.shape, test_raw.shape, sample_sub.shape)
display(train_raw.head(3))
display(test_raw.head(3))
display(sample_sub.head(3))


Shapes: (30676, 12) (13148, 11) (13148, 2)


Unnamed: 0,No,DEWP,TEMP,PRES,Iws,Is,Ir,datetime,cbwd_NW,cbwd_SE,cbwd_cv,pm2.5
0,1,-1.580878,-1.92225,0.443328,-0.441894,-0.069353,-0.137667,2010-01-01 00:00:00,1.448138,-0.732019,-0.522096,
1,2,-1.580878,-2.004228,0.345943,-0.379306,-0.069353,-0.137667,2010-01-01 01:00:00,1.448138,-0.732019,-0.522096,
2,3,-1.580878,-1.92225,0.248559,-0.343514,-0.069353,-0.137667,2010-01-01 02:00:00,1.448138,-0.732019,-0.522096,


Unnamed: 0,No,DEWP,TEMP,PRES,Iws,Is,Ir,datetime,cbwd_NW,cbwd_SE,cbwd_cv
0,30677,1.190496,0.701029,-2.186052,-0.003982,-0.069353,-0.137667,2013-07-02 04:00:00,1.448138,-0.732019,-0.522096
1,30678,1.121211,0.619051,-2.186052,0.031811,-0.069353,-0.137667,2013-07-02 05:00:00,1.448138,-0.732019,-0.522096
2,30679,1.190496,0.783006,-2.186052,0.094398,-0.069353,-0.137667,2013-07-02 06:00:00,1.448138,-0.732019,-0.522096


Unnamed: 0,row ID,pm2.5
0,2013-07-02 4:00:00,14
1,2013-07-02 5:00:00,14
2,2013-07-02 6:00:00,14


In [5]:
# Detect the datetime/target column names for train; for test only the
# datetime name is kept (the second return value is discarded into `_`).
train_dt, train_y = detect_columns(train_raw)
test_dt, _ = detect_columns(test_raw)

print("Detected:")
print("  train datetime:", train_dt)
print("  train target  :", train_y)
print("  test  datetime:", test_dt)

# Stop here if any of the three required column names is empty/None.
assert train_dt and train_y and test_dt, "Failed to detect dt/target columns."

# Rebind `train` / `test` to the output of parse_sort_assert.
# NOTE(review): presumably this parses the datetime column and sorts by it
# (the log message below says "Parsed & sorted") — confirm in src.validate.
train = parse_sort_assert(train_raw, train_dt)
test  = parse_sort_assert(test_raw, test_dt)

# Log the last train timestamp and the first test timestamp.
log.info("Parsed & sorted. Train[end]=%s | Test[start]=%s", train[train_dt].iloc[-1], test[test_dt].iloc[0])


2025-09-21 16:42:35,575 | INFO | Parsed & sorted. Train[end]=2013-07-02 03:00:00 | Test[start]=2013-07-02 04:00:00


Detected:
  train datetime: datetime
  train target  : pm2.5
  test  datetime: datetime


In [6]:
# Build a summary dict for each split (the target column is only passed for train).
train_snap = snapshot(train, train_dt, train_y)
test_snap = snapshot(test, test_dt)
print("TRAIN snapshot:", train_snap)
print("TEST  snapshot:", test_snap)

# Check the hand-off between the last train timestamp and the first test one.
ok = boundary_check(train[train_dt].iloc[-1], test[test_dt].iloc[0])
if ok:
    print("Boundary:", "PASS ✅")
else:
    print("Boundary:", "FAIL ❌")

# Rows on either side of the split: last three of train, first three of test.
display(train[[train_dt, train_y]].tail(3))
display(test[[test_dt]].head(3))


2025-09-21 16:42:35,580 | INFO | Boundary check (last train + 1h == first test): PASS ✅


TRAIN snapshot: {'n_rows': 30676, 'n_cols': 12, 'first_ts': '2010-01-01T00:00:00', 'last_ts': '2013-07-02T03:00:00', 'columns': ['No', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'datetime', 'cbwd_NW', 'cbwd_SE', 'cbwd_cv', 'pm2.5'], 'target_non_null': 28755, 'target_null': 1921}
TEST  snapshot: {'n_rows': 13148, 'n_cols': 11, 'first_ts': '2013-07-02T04:00:00', 'last_ts': '2014-12-31T23:00:00', 'columns': ['No', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'datetime', 'cbwd_NW', 'cbwd_SE', 'cbwd_cv']}
Boundary: PASS ✅


Unnamed: 0,datetime,pm2.5
30673,2013-07-02 01:00:00,32.0
30674,2013-07-02 02:00:00,19.0
30675,2013-07-02 03:00:00,18.0


Unnamed: 0,datetime
0,2013-07-02 04:00:00
1,2013-07-02 05:00:00
2,2013-07-02 06:00:00


## 4. Datetime parsing & continuity checks



In [7]:
# Write one continuity figure per split through the project helper.
# The second call is the cell's last expression, so its return value (a path,
# per the recorded output) is what the cell displays.
save_continuity_hist(train, train_dt, "train")
save_continuity_hist(test,  test_dt,  "test")


2025-09-21 16:42:35,644 | INFO | Saved /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_continuity_train.png
2025-09-21 16:42:35,681 | INFO | Saved /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_continuity_test.png


PosixPath('/Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_continuity_test.png')

## 5. Missingness analysis
### 5.1 Missingness profiles + figures



In [8]:
# Train missingness
miss_train = profile_missingness(train, tag="train", top_n=20)
display(miss_train.table.head(20))
print("Saved figure:", miss_train.figure_path)

# Test missingness
miss_test = profile_missingness(test, tag="test", top_n=20)
display(miss_test.table.head(20))
print("Saved figure:", miss_test.figure_path)


2025-09-21 16:42:35,797 | INFO | Saved /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_missing_train.png


Unnamed: 0,na_count,na_pct
pm2.5,1921,6.262225
No,0,0.0
DEWP,0,0.0
TEMP,0,0.0
PRES,0,0.0
Iws,0,0.0
Is,0,0.0
Ir,0,0.0
datetime,0,0.0
cbwd_NW,0,0.0


2025-09-21 16:42:35,844 | INFO | Saved /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_missing_test.png


Saved figure: /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_missing_train.png


Unnamed: 0,na_count,na_pct
No,0,0.0
DEWP,0,0.0
TEMP,0,0.0
PRES,0,0.0
Iws,0,0.0
Is,0,0.0
Ir,0,0.0
datetime,0,0.0
cbwd_NW,0,0.0
cbwd_SE,0,0.0


Saved figure: /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_missing_test.png


### 5.2 Plan: drop rows with a missing target and ffill/bfill the features (count + dry run)

In [9]:
# Count rows with missing target in TRAIN (to be DROPPED)
n_miss_target = count_missing_target(train, target_col=train_y)
print(f"Rows with missing target in train: {n_miss_target}")

# Dry-run the feature fill to see reduction in NaNs (does not mutate)
dry_train = dry_run_feature_fill(train.drop(columns=[train_y], errors="ignore"))
dry_test  = dry_run_feature_fill(test.copy())
print("TRAIN NaNs before/after ffill+bfill:", dry_train)
print("TEST  NaNs before/after ffill+bfill:", dry_test)


Rows with missing target in train: 1921
TRAIN NaNs before/after ffill+bfill: {'total_nan_before': 0, 'total_nan_after_ffill_bfill': 0}
TEST  NaNs before/after ffill+bfill: {'total_nan_before': 0, 'total_nan_after_ffill_bfill': 0}


### 5.3 Quick integrity prints

In [10]:
# Which columns have any NaNs after ffill/bfill (dry run)?
cols_train_nan = miss_train.table.query("na_count > 0").index.tolist()
cols_test_nan  = miss_test.table.query("na_count > 0").index.tolist()
print("Columns with NaNs (train):", cols_train_nan[:20])
print("Columns with NaNs (test) :", cols_test_nan[:20])


Columns with NaNs (train): ['pm2.5']
Columns with NaNs (test) : []


In [11]:
# Quick fix: Import OUTPUTS if not already available
try:
    OUTPUTS
    print("OUTPUTS is already available")
except NameError:
    from src.config import OUTPUTS
    print("OUTPUTS imported successfully")
    print(f"OUTPUTS points to: {OUTPUTS}")


OUTPUTS is already available


## 6. Target distribution & outliers



## 7. Seasonality (hour/day/week/month)



### 7.1 Standardize target name in-memory for plotting

In [12]:
# Create pm2_5 alias if needed (no file write yet)
if "pm2_5" not in train.columns:
    if "pm2.5" in train.columns:
        train["pm2_5"] = train["pm2.5"].astype(float)
    else:
        raise ValueError("Could not find target column 'pm2.5' to alias as 'pm2_5'.")

# Quick sanity
display(train[["pm2_5"]].head(3))


Unnamed: 0,pm2_5
0,
1,
2,


### 7.2 Target distribution plots

In [13]:
# Save a histogram and a boxplot of the aliased train target. Each helper's
# return value is kept (p1, p2) and printed as the saved location.
p1 = hist_target(train["pm2_5"], "Target distribution: pm2_5 (train)", "eda_target_hist.png")
p2 = box_target(train["pm2_5"], "Target boxplot: pm2_5 (train)", "eda_target_box.png")
print("Saved:", p1, p2)


2025-09-21 16:42:35,943 | INFO | Saved /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_target_hist.png
2025-09-21 16:42:35,977 | INFO | Saved /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_target_box.png


Saved: /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_target_hist.png /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_target_box.png


### 7.3 Add time parts (hour/month) for seasonal plots (no cyclic yet)

In [14]:
# Derive a frame with time-part columns from the train datetime column; the
# plots below group by its "hour" and "month" columns.
train_tp = add_time_parts(train, dt_col=train_dt)

# Mean target by hour of day.
p_hour = seasonal_mean(train_tp, by="hour",  target="pm2_5",
                       title="Seasonality: Hour-of-day mean pm2_5 (train)",
                       fname="eda_seasonality_hour.png")

# Mean target by calendar month.
p_month = seasonal_mean(train_tp, by="month", target="pm2_5",
                        title="Seasonality: Month mean pm2_5 (train)",
                        fname="eda_seasonality_month.png")

print("Saved:", p_hour, p_month)


2025-09-21 16:42:36,089 | INFO | Saved /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_seasonality_hour.png
2025-09-21 16:42:36,139 | INFO | Saved /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_seasonality_month.png


Saved: /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_seasonality_hour.png /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_seasonality_month.png


### 7.4 Lock decision: train on log1p(pm2_5)

In [15]:
# Rationale (concise): pm2_5 is right-skewed with heavy tails → log1p stabilizes variance and reduces the influence of outliers.
# Adds the transformed column to `train` in place. Rows whose target is missing
# stay NaN after the transform (the preview's first rows are NaN in both columns).
train["pm2_5_log1p"] = np.log1p(train["pm2_5"])
print("Preview of transformed target:")
display(train[["pm2_5", "pm2_5_log1p"]].head(3))


Preview of transformed target:


Unnamed: 0,pm2_5,pm2_5_log1p
0,,
1,,
2,,


## 8. Weather/exogenous relationships
### 8.1 Choose candidate feature set



In [16]:
# Good starting set for this dataset (auto-filtered to what's present)
cand = candidate_exogenous(train)
print("Candidate exogenous features:", cand)

# If you want to include all numeric cols except id/time/target/one-hots:
EXCLUDE = {train_dt, TARGET}
EXCLUDE |= {c for c in train.columns if c.startswith("cbwd")}  # wind dir one-hots (categorical)
num_all = numeric_columns(train, exclude=EXCLUDE)
print("All numeric (excl time/target/one-hots):", num_all[:12], "... (n=", len(num_all), ")")


Candidate exogenous features: ['TEMP', 'DEWP', 'PRES', 'Iws', 'Is', 'Ir']
All numeric (excl time/target/one-hots): ['No', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'pm2_5', 'pm2_5_log1p'] ... (n= 9 )


### 8.2 Correlation with target (Pearson & Spearman)

In [17]:
# Linear (Pearson) association of each feature with the target. The curated
# candidate list is used when it is non-empty, otherwise the numeric pool.
pear = corr_with_target(train, TARGET, features=cand or num_all, method="pearson")
display(pear)

# Rank-based (Spearman) association over the same feature set.
spear = corr_with_target(train, TARGET, features=cand or num_all, method="spearman")
display(spear)

# Persist both tables as CSV under OUTPUTS for the report.
for corr_table, csv_name in (
    (pear, "pearson_with_target.csv"),
    (spear, "spearman_with_target.csv"),
):
    corr_table.to_csv(OUTPUTS / csv_name, index=False)


Unnamed: 0,feature,corr,abs_corr
0,Iws,-0.26025,0.26025
1,DEWP,0.218187,0.218187
2,PRES,-0.107773,0.107773
3,Ir,-0.052288,0.052288
4,TEMP,-0.039601,0.039601
5,Is,0.022279,0.022279


Unnamed: 0,feature,corr,abs_corr
0,Iws,-0.375004,0.375004
1,DEWP,0.348564,0.348564
2,PRES,-0.196648,0.196648
3,TEMP,0.063766,0.063766
4,Is,0.048279,0.048279
5,Ir,0.001529,0.001529


### 8.3 Feature collinearity (optional but useful)

In [18]:
# Pairwise Pearson correlations among the chosen features, computed on rows
# that have a target value. Falls back to the first 20 numeric columns when
# the candidate list is empty, to keep the matrix readable.
cols_for_matrix = cand or num_all[:20]
cmat = corr_matrix(
    train.dropna(subset=[TARGET]),
    cols_for_matrix,
    method="pearson",
)
display(cmat.round(3))


Unnamed: 0,TEMP,DEWP,PRES,Iws,Is,Ir
TEMP,1.0,0.829,-0.825,-0.141,-0.102,0.05
DEWP,0.829,1.0,-0.77,-0.285,-0.037,0.127
PRES,-0.825,-0.77,1.0,0.177,0.081,-0.076
Iws,-0.141,-0.285,0.177,1.0,0.021,-0.006
Is,-0.102,-0.037,0.081,0.021,1.0,-0.011
Ir,0.05,0.127,-0.076,-0.006,-0.011,1.0


### 8.4 Scatter/hexbin plots for the top features

In [19]:
# Pick top 3–4 by absolute Pearson corr (fallback to Spearman if needed)
top_feats = pear.dropna().head(4)["feature"].tolist() if pear.dropna().shape[0] else []
print("Top features to visualize:", top_feats)

paths = []
for feat in top_feats:
    paths.append(
        scatter_xy(
            train, x=feat, y=TARGET,
            fname=f"eda_scatter_{feat.lower()}.png",
            title=f"{TARGET} vs {feat}"
        )
    )
    # For very dense features like TEMP or PRES, also add a hexbin:
    paths.append(
        hexbin_xy(
            train, x=feat, y=TARGET,
            fname=f"eda_hex_{feat.lower()}.png",
            title=f"{TARGET} vs {feat} (hexbin)"
        )
    )

print("Saved:", paths)


2025-09-21 16:42:36,246 | INFO | Saved /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_scatter_iws.png
2025-09-21 16:42:36,319 | INFO | Saved /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_hex_iws.png


Top features to visualize: ['Iws', 'DEWP', 'PRES', 'Ir']


2025-09-21 16:42:36,371 | INFO | Saved /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_scatter_dewp.png
2025-09-21 16:42:36,450 | INFO | Saved /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_hex_dewp.png
2025-09-21 16:42:36,497 | INFO | Saved /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_scatter_pres.png
2025-09-21 16:42:36,573 | INFO | Saved /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_hex_pres.png
2025-09-21 16:42:36,617 | INFO | Saved /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_scatter_ir.png
2025-09-21 16:42:36,687 | INFO | Saved /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_hex_ir.png


Saved: [PosixPath('/Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_scatter_iws.png'), PosixPath('/Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_hex_iws.png'), PosixPath('/Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_scatter_dewp.png'), PosixPath('/Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_hex_dewp.png'), PosixPath('/Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_scatter_pres.png'), PosixPath('/Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_hex_pres.png'), PosixPath('/Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_scatter_ir.png'), PosixPath('/Users/testsolutions/Documents/school/year3/term2/time-seri

## 9. Temporal dependence (ACF/PACF)
### 9.1 Autocorrelation up to 400h


In [20]:
# Plot the autocorrelation of the train target for lags up to 400 and keep
# the value the helper returns (printed as the saved location).
# NOTE(review): the series is selected with TARGET, while the file name and
# title say "pm2_5"; the train target also has missing values, and how
# plot_autocorr treats them is not visible from this notebook — confirm.
ac_path = plot_autocorr(
    train[TARGET],
    max_lag=400,
    fname="eda_autocorr_pm2_5.png",
    title="Autocorrelation of pm2_5 (train) up to 400h",
)
print("Saved:", ac_path)


2025-09-21 16:42:36,820 | INFO | Saved /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_autocorr_pm2_5.png


Saved: /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/figures/eda_autocorr_pm2_5.png


### 9.2 Quantify seasonal lags

In [21]:
# Lags to probe, in rows of the series. With hourly data these are 1, 2 and
# 3 days, then 1 and 2 weeks.
lags = [24, 48, 72, 168, 336]
# The helper's result is printed as a {lag: autocorrelation} mapping.
strength = seasonal_lag_strength(train[TARGET], periods=lags)
print("Autocorr at key lags:", strength)


Autocorr at key lags: {24: 0.3799216730277738, 48: 0.16127758109060913, 72: 0.08847874716128508, 168: 0.014484060435086314, 336: 0.03285973703086032}


### 9.3 PACF (if statsmodels is available)

In [22]:
# Try to draw the partial autocorrelation up to lag 60. The helper is optional:
# in the run recorded with this notebook it logged "statsmodels not available;
# skipping PACF." and returned None, which used to be reported as
# "PACF saved: None". Report a skip explicitly instead.
pacf_path = plot_pacf_optional(train[TARGET], max_lag=60, fname="eda_pacf_pm2_5.png")
if pacf_path is None:
    print("PACF skipped: no figure was written.")
else:
    print("PACF saved:", pacf_path)


2025-09-21 16:42:36,830 | INFO | statsmodels not available; skipping PACF.


PACF saved: None


### 9.4 Lock lookback candidates

In [23]:
# Heuristic decision based on ACF: keep 72, 168, 336
# (lookback window lengths in hourly steps: 3 days, 1 week, 2 weeks).
LOOKBACK_CANDIDATES = [72, 168, 336]
# Bare expression so the cell displays the chosen list.
LOOKBACK_CANDIDATES


[72, 168, 336]

## 10. Preprocessing decisions (rationale)



### 10.1 Run the preprocessing pipeline and save files

In [24]:
# Reload the raw frames (rebinding train_raw / test_raw; the third return
# value is discarded) and run the preprocessing pipeline on them.
train_raw, test_raw, _ = load_raw()
# save_csv=True asks the pipeline to persist its result; it returns the
# interim frame and a metadata object, both of which are shown below.
interim, meta = build_interim(train_raw, test_raw, save_csv=True)

# NOTE(review): these two paths are composed here in the notebook, not
# returned by build_interim — confirm they match what the pipeline writes.
print("Wrote:", DATA_INTERIM / "clean.csv")
print("Wrote:", DATA_INTERIM / "clean_meta.json")
print("Meta:", meta)
display(interim.head())


Wrote: /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/data/interim/clean.csv
Wrote: /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/data/interim/clean_meta.json
Meta: CleanMeta(n_train_rows=28755, n_test_rows=13148, dropped_train_missing_target=1921, train_time_min='2010-01-02T00:00:00', train_time_max='2013-07-02T03:00:00', test_time_min='2013-07-02T04:00:00', test_time_max='2014-12-31T23:00:00', columns=['No', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'datetime', 'cbwd_NW', 'cbwd_SE', 'cbwd_cv', 'pm2.5', 'pm2_5', 'hour', 'day_of_week', 'month', 'day_of_year', 'sin_hour', 'cos_hour', 'sin_doy', 'cos_doy', 'split'])


Unnamed: 0,No,DEWP,TEMP,PRES,Iws,Is,Ir,datetime,cbwd_NW,cbwd_SE,...,pm2_5,hour,day_of_week,month,day_of_year,sin_hour,cos_hour,sin_doy,cos_doy,split
0,25,-1.234456,-1.348408,0.345943,-0.441894,-0.069353,-0.137667,2010-01-02 00:00:00,-0.690542,1.366085,...,129.0,0,5,1,2,0.0,1.0,0.034328,0.999411,train
1,26,-1.165172,-1.348408,0.345943,-0.424097,-0.069353,-0.137667,2010-01-02 01:00:00,-0.690542,1.366085,...,148.0,1,5,1,2,0.258819,0.965926,0.034328,0.999411,train
2,27,-0.888034,-1.430386,0.443328,-0.406301,-0.069353,-0.137667,2010-01-02 02:00:00,-0.690542,1.366085,...,159.0,2,5,1,2,0.5,0.866025,0.034328,0.999411,train
3,28,-0.610897,-1.430386,0.540712,-0.370508,1.245803,-0.137667,2010-01-02 03:00:00,-0.690542,1.366085,...,181.0,3,5,1,2,0.707107,0.707107,0.034328,0.999411,train
4,29,-0.610897,-1.430386,0.540712,-0.352712,2.560959,-0.137667,2010-01-02 04:00:00,-0.690542,1.366085,...,138.0,4,5,1,2,0.866025,0.5,0.034328,0.999411,train


### 10.2 Quick sanity checks

In [25]:
# 1) Every row of the train split carries a target value
assert not interim.query("split=='train'")["pm2_5"].isna().any()

# 2) Within each split, timestamps are increasing and free of duplicates
for split_name in ("train", "test"):
    stamps = interim.loc[interim["split"] == split_name, "datetime"]
    assert stamps.is_unique and stamps.is_monotonic_increasing

# 3) Calendar and cyclic feature columns are all present
required_cols = (
    "hour", "day_of_week", "month", "day_of_year",
    "sin_hour", "cos_hour", "sin_doy", "cos_doy",
)
for col in required_cols:
    assert col in interim.columns, f"Missing feature {col}"

# 4) Report the covered time range and row count of each split
tr = interim[interim["split"] == "train"]
te = interim[interim["split"] == "test"]
for label, part in (("Train range:", tr), ("Test  range:", te)):
    print(label, part["datetime"].min(), "→", part["datetime"].max(), "| rows:", len(part))


Train range: 2010-01-02 00:00:00 → 2013-07-02 03:00:00 | rows: 28755
Test  range: 2013-07-02 04:00:00 → 2014-12-31 23:00:00 | rows: 13148


## 11. Feature list (contract)



In [27]:
# Work on a copy of the train split of the interim table.
df_tr = interim.query("split=='train'").copy()
# NOTE: this rebinds EXCLUDE, which an earlier cell used for a different set.
EXCLUDE = {"datetime", "split", "pm2.5", "pm2_5"}  # exclude targets/time/split
# Feature contract: every remaining column, in the frame's column order.
# NOTE(review): "No" stays in the list; in the raw previews it reads 1, 2, 3,
# so it looks like a sequential row counter — confirm it is meant as a feature.
feature_cols = [c for c in df_tr.columns if c not in EXCLUDE]
print("n_features:", len(feature_cols))
print(feature_cols)


n_features: 18
['No', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'cbwd_NW', 'cbwd_SE', 'cbwd_cv', 'hour', 'day_of_week', 'month', 'day_of_year', 'sin_hour', 'cos_hour', 'sin_doy', 'cos_doy']


In [29]:
from src.baselines import evaluate_baselines
from src.config import OUTPUTS

# From here on the target is the underscore alias found in the interim table.
TARGET = "pm2_5"

# Score the baselines on the train split over the blocked folds, using a
# 24-step seasonal lag for the seasonal baseline.
base_df = evaluate_baselines(
    interim.query("split=='train'"),
    "datetime",
    TARGET,
    folds,
    seasonal_lag=24,
)
display(base_df)

# Save for the report
OUTPUTS.mkdir(parents=True, exist_ok=True)
baselines_csv = OUTPUTS / "baselines_cv.csv"
base_df.to_csv(baselines_csv, index=False)
print("Wrote", baselines_csv)


Unnamed: 0,fold,rmse_naive,rmse_seasonal24,val_rows
0,fold1,23.169255,91.354681,695
1,fold2,19.757798,69.452671,680
2,fold3,12.996336,59.615749,693


Wrote /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/baselines_cv.csv


## 12. Artifacts to write (clean parquet, metadata)