In [1]:
import sales_forecasting.config as config
import sales_forecasting.models.train as train_module

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import numpy as np


# Read the saved feature array (X.npy) and target array (y.npy) from
# config.PROCESSED_DATA_DIR. Both files are loaded before either name is bound.
X, y = [np.load(config.PROCESSED_DATA_DIR / fname) for fname in ("X.npy", "y.npy")]

In [None]:
# Dataset overview for the already-loaded X and y: shapes, X/y alignment,
# memory use, zero / NaN / inf checks, and per-feature / per-node aggregates.

# X is unpacked as a 3-D array; the axes are read as (time, node, feature).
timesteps, num_nodes, num_features = X.shape
print(
    "Interpreting X as (timesteps, num_nodes, num_features): "
    f"X shape: {X.shape}  -> timesteps={timesteps}, "
    f"num_nodes={num_nodes}, num_features={num_features}"
)
print(f"y shape: {y.shape}")

# Targets are expected to hold one value per (timestep, node) pair.
if y.shape != (timesteps, num_nodes):
    print("y does NOT align with X. Expected shape (timesteps, num_nodes).")
else:
    print("y aligns with X (per-timestep per-node targets).")

# Memory footprint in MiB (2**20 bytes).
print(f"Memory: X = {X.nbytes / 2**20:.2f} MB, y = {y.nbytes / 2**20:.2f} MB")

# Share of exactly-zero entries, plus NaN / inf presence.
x_zero_frac = np.sum(X == 0) / X.size
y_zero_frac = np.sum(y == 0) / y.size
print(f"Zero fraction: X = {x_zero_frac:.3%}, y = {y_zero_frac:.3%}")
print(f"Contains NaN: X={np.any(np.isnan(X))}, y={np.any(np.isnan(y))}")
print(f"Contains inf: X={np.any(np.isinf(X))}, y={np.any(np.isinf(y))}")

# Reduce over the time and node axes, leaving one value per feature.
feat_min = np.min(X, axis=(0, 1))
feat_mean = np.mean(X, axis=(0, 1))
feat_max = np.max(X, axis=(0, 1))
print("Per-feature (min, mean, max):")
for i in range(len(feat_min)):
    print(f"  feature_{i}: {feat_min[i]:.3f}, {feat_mean[i]:.3f}, {feat_max[i]:.3f}")

# Reduce over the time and feature axes, leaving one mean per node.
node_means = np.mean(X, axis=(0, 2))  # shape == (num_nodes,)
print(f"Per-node mean (first 10 nodes): {node_means[:10]}")
print(
    f"Per-node mean (overall): min={node_means.min():.3f}, "
    f"mean={node_means.mean():.3f}, max={node_means.max():.3f}"
)

Interpreting X as (timesteps, num_nodes, num_features): X shape: (220, 41, 4)  -> timesteps=220, num_nodes=41, num_features=4
y shape: (220, 41)
y aligns with X (per-timestep per-node targets).
Memory: X = 0.28 MB, y = 0.07 MB
Zero fraction: X = 46.716%, y = 46.220%
Contains NaN: X=False, y=False
Contains inf: X=False, y=False
Per-feature (min, mean, max):
  feature_0: 0.000, 849.287, 18859.000
  feature_1: 0.000, 848.776, 19617.000
  feature_2: 0.000, 848.499, 18517.000
  feature_3: 0.000, 859.555, 29251.000
Per-node mean (first 10 nodes): [ 387.09137841 6794.56170455 1007.55857739 4775.58838444 7491.37565298
 2828.54197132  220.49869888   88.64904003  302.24630682  554.20202273]
Per-node mean (overall): min=0.001, mean=851.529, max=7491.376


In [None]:
# Sample counts for a sliding look-back window.
# Relies on `timesteps` and `num_nodes` having been set by an earlier cell.
window = 5

# A timestep can be a target only if `window` earlier steps precede it;
# the count is clamped at zero when the series is shorter than the window.
usable_timesteps = timesteps - window if timesteps > window else 0
samples_full_graph = usable_timesteps  # one sample per target timestep (whole graph)
samples_per_node = num_nodes * usable_timesteps  # one sample per (timestep, node) pair

print(
    f"timesteps = {timesteps}, window = {window}",
    f"Usable target timesteps = {usable_timesteps}",
    f"Samples (one sample = full graph per timestep): {samples_full_graph}",
    f"Samples (one sample = per-node per-timestep): {samples_per_node}",
    sep="\n",
)

timesteps = 220, window = 5
Usable target timesteps = 215
Samples (one sample = full graph per timestep): 215
Samples (one sample = per-node per-timestep): 8815
