In [None]:
# Section 1: (Re)import & Setup: safe copies, logging and helper reload
import os
import sys
import json
import pandas as pd
import numpy as np
import subprocess
from datetime import datetime

# Reuse train_df when the main notebook already defined it in this kernel; otherwise try to
# load the processed CSV. The location can be overridden with the RAILWAY_TRAIN_PATH
# environment variable so the notebook is not tied to a single machine; the original
# absolute path is kept as the default.
try:
    train_df
except NameError:
    TRAIN_PATH = os.environ.get('RAILWAY_TRAIN_PATH') or (
        r"D:\MSE\5. Data Mining\railway-delay\data\processed\merged_train_data.csv"
    )
    if os.path.exists(TRAIN_PATH):
        # parse_dates=True only asks pandas to try parsing the index; the helper cells
        # convert the schedule column themselves with pd.to_datetime.
        train_df = pd.read_csv(TRAIN_PATH, parse_dates=True)
        print('Loaded train_df from:', TRAIN_PATH)
    else:
        # No data available: fall back to an empty frame so the helper cells can still run.
        # Say so explicitly, otherwise every later cell quietly works on zero rows.
        train_df = pd.DataFrame()
        print('WARNING: training CSV not found at', TRAIN_PATH, '- using an empty DataFrame')

# Work on a copy so the debugging cells below never modify train_df by accident
df = train_df.copy()
print('Working df shape:', df.shape)
print('Columns:', df.columns.tolist())

# Redefine / show our helper definitions (we'll override later with robust versions if needed)

# Tiny logger: tags every message so this notebook's output is easy to spot
def _log(msg):
    """Print ``msg`` prefixed with the ``[fix_feature_engineering]`` tag."""
    print("[fix_feature_engineering] {}".format(msg))

_log('Setup done')


In [None]:
# Section 2: Inspect route/time columns and reproduce KeyError (safe diagnostics)

# Column names that may identify a route, in priority order; the first one present wins
route_candidates = ['STATION_ID', 'ROUTE_ID', 'TRAIN_ID', 'TRAIN_NUMBER', 'ROUTE', 'STATION']
route_col = next((name for name in route_candidates if name in df.columns), None)

_log(f'route detected: {route_col}')
_log(f"index names: {df.index.names}")

# Column names that may hold the scheduled timestamp, in priority order
sched_candidates = ['SCHEDULED_DT', 'SCHEDULED_DATE', 'SCHEDULED_TIME', 'SCHEDULED']
schedule_col = next((name for name in sched_candidates if name in df.columns), None)

_log(f'schedule column: {schedule_col}')

# Report a few sample values for the route column and the dtype of the schedule column
if route_col is None:
    _log('No route column present')
else:
    if route_col in df.columns:
        route_sample = df[route_col].unique()[:5]
    else:
        route_sample = 'Route not in columns'
    _log(f"Unique sample for {route_col}: {route_sample}")

if schedule_col is None:
    _log('No schedule column present')
else:
    _log(f"Schedule dtype: {df[schedule_col].dtype}")

# Replay the original (fragile) logic inside try/except so a failure is reported, not raised
try:
    if schedule_col is None:
        raise KeyError('SCHEDULED_DT missing for demonstration')
    # Same steps as the original code: parse the timestamps, then use them as the index
    df_test = df.copy()
    df_test[schedule_col] = pd.to_datetime(df_test[schedule_col], errors='coerce')
    df_test = df_test.set_index(schedule_col)
    # When route_col is None the groupby below is the call expected to raise
    _log('Attempting original grouping approach')
    df_test['ROLLING_MEAN_DELAY_7D'] = (
        df_test
        .groupby(route_col)['TARGET']
        .transform(lambda x: x.rolling('7D').mean())
    )
    _log('Original approach succeeded (unexpected)')
except Exception as err:
    _log('Original approach produced exception:')
    _log(str(err))
    _log('Diagnostics:')
    _log(f'df columns: {df.columns.tolist()}')
    _log(f'df index name: {df.index.name}')
    _log(f'route_col used: {route_col}')
    if not (route_col is None or route_col in df.columns):
        _log(f"Note: route_col {route_col} not present in df.columns; maybe it is an index? df.index.names={df.index.names}")


In [None]:
# Section 3: Robust compute_rolling_features (fixed implementation)

# We'll implement a robust rolling function that handles:
# - schedule_col missing -> skip
# - route_col present as column -> groupby column + time-based rolling
# - route_col present as index level -> groupby(level=route_col)
# - route_col not present -> global rolling on time index


def compute_rolling_features_safe(df_in, target_col='TARGET', schedule_col='SCHEDULED_DT', candidates=None, w7='7D', w30='30D', closed=None):
    """Add rolling means of the target as ``ROLLING_MEAN_DELAY_7D`` and ``ROLLING_MEAN_DELAY_30D``.

    The windows are computed over ``schedule_col``, separately per route when a route
    identifier is available, otherwise over the whole frame.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input frame. It is copied, never modified.
    target_col : str
        Column to average. Values are coerced to float; unparseable values count as missing.
    schedule_col : str
        Column holding the timestamps; converted with ``pd.to_datetime(errors='coerce')``.
    candidates : list of str, optional
        Route identifier names in priority order. The first one found among the columns is
        used; if none is a column, the first one found among the index level names is used.
    w7, w30 : str or int
        Window specifications passed to ``rolling`` for the two features.
    closed : str, optional
        Passed to ``rolling(closed=...)``. With the default ``None`` pandas closes offset
        windows on the right, so every row's own target value is part of its mean. Pass
        ``'left'`` to exclude the current row, e.g. when the feature feeds a model that
        predicts ``target_col``.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the two feature columns added. When ``schedule_col`` is
        present the rows are returned sorted by (route column, schedule) or by schedule
        only; the original index labels are kept.

    Notes
    -----
    * ``schedule_col`` missing: both features are set to the target median (NaN when the
      target is missing as well).
    * ``target_col`` missing: both features are set to NaN.
    * Rows whose timestamp cannot be parsed are left out of the windows instead of making
      the rolling computation fail; they receive the fill value.
    * Remaining NaNs are filled with the route median of the target when rows are grouped,
      otherwise with the global median. Filling is positional, so duplicate index labels
      are harmless.
    """
    feature_windows = (('ROLLING_MEAN_DELAY_7D', w7), ('ROLLING_MEAN_DELAY_30D', w30))
    df = df_in.copy()
    if candidates is None:
        candidates = ['STATION_ID', 'ROUTE_ID', 'TRAIN_ID', 'TRAIN_NUMBER', 'ROUTE', 'STATION']

    if schedule_col not in df.columns:
        _log(f"{schedule_col} missing; skipping rolling features and filling with median")
        constant = df[target_col].median() if target_col in df.columns else np.nan
        for feature, _ in feature_windows:
            df[feature] = constant
        return df

    if target_col not in df.columns:
        _log(f"{target_col} missing; rolling features set to NaN")
        for feature, _ in feature_windows:
            df[feature] = np.nan
        return df

    df[schedule_col] = pd.to_datetime(df[schedule_col], errors='coerce')

    # Prefer a route column; fall back to a route stored as an index level.
    route_column = next((c for c in candidates if c in df.columns), None)
    route_level = None
    if route_column is None:
        route_level = next((c for c in candidates if c in df.index.names), None)

    # Sort first, then read the grouping keys positionally from the sorted frame. Taking
    # the index-level keys as an array keeps them usable even though the rolling series
    # below is indexed by time only.
    if route_column is not None:
        df = df.sort_values([route_column, schedule_col])
        group_keys = df[route_column].to_numpy()
    else:
        df = df.sort_values(schedule_col)
        group_keys = None
        if route_level is not None:
            group_keys = df.index.get_level_values(route_level).to_numpy()

    target = pd.to_numeric(df[target_col], errors='coerce').astype(float)

    # Time-based windows need a DatetimeIndex without NaT: roll over valid rows only.
    has_time = df[schedule_col].notna().to_numpy()
    timed_target = pd.Series(
        target.to_numpy()[has_time],
        index=pd.DatetimeIndex(df[schedule_col].to_numpy()[has_time]),
    )
    timed_keys = None if group_keys is None else group_keys[has_time]

    # Per-row fill value for anything the rolling windows could not produce.
    if group_keys is None:
        fill_values = np.full(len(df), target.median())
    else:
        fill_values = target.groupby(group_keys).transform('median').to_numpy()

    for feature, window in feature_windows:
        values = np.full(len(df), np.nan)
        if has_time.any():
            if timed_keys is None:
                rolled = timed_target.rolling(window, closed=closed).mean()
            else:
                rolled = timed_target.groupby(timed_keys).transform(
                    lambda s: s.rolling(window, closed=closed).mean()
                )
            values[has_time] = rolled.to_numpy()
        gaps = np.isnan(values)
        values[gaps] = fill_values[gaps]
        df[feature] = values

    return df

# Confirm in the cell output that the helper was (re)defined in this kernel
_log('compute_rolling_features_safe defined')


In [None]:
# Section 4: Improve compute_prev_delay (safe grouping + index handling)

def compute_prev_delay_safe(df_in, target_col='TARGET', schedule_col='SCHEDULED_DT', candidates=None, default_fill=-1):
    """Add ``PREV_DELAY``: the target value of the previous row in schedule order.

    "Previous" is taken per route when a route identifier is available (first as a
    column, otherwise as an index level); without one it is the previous row of the whole
    frame ordered by ``schedule_col``.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input frame. It is copied, never modified.
    target_col : str
        Column whose previous value is taken. Values are coerced to float; unparseable
        values count as missing.
    schedule_col : str
        Column holding the timestamps; converted with ``pd.to_datetime(errors='coerce')``.
    candidates : list of str, optional
        Route identifier names in priority order.
    default_fill : scalar
        Value used when no previous value and no median is available, and for the whole
        column when ``schedule_col`` or ``target_col`` is missing.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with ``PREV_DELAY`` added. When both required columns are
        present the rows are returned sorted by (route column, schedule) or by schedule
        only; the original index labels are kept.

    Notes
    -----
    Rows without a previous value (the first row of each group) receive the route median
    of the target when rows are grouped, otherwise the global median; anything still
    missing becomes ``default_fill``. Filling is positional, so duplicate index labels
    are harmless.
    """
    df = df_in.copy()
    if candidates is None:
        candidates = ['TRAIN_ID','STATION_ID','ROUTE_ID','TRAIN_NUMBER','ROUTE','STATION']

    # Without both columns there is nothing to shift: return the default instead of raising.
    for required in (schedule_col, target_col):
        if required not in df.columns:
            _log(f"{required} missing; PREV_DELAY will be set to default {default_fill}")
            df['PREV_DELAY'] = default_fill
            return df

    df[schedule_col] = pd.to_datetime(df[schedule_col], errors='coerce')

    # Prefer a route column; fall back to a route stored as an index level.
    route_column = next((c for c in candidates if c in df.columns), None)
    route_level = None
    if route_column is None:
        route_level = next((c for c in candidates if c in df.index.names), None)

    # Sort first, then read the grouping keys positionally from the sorted frame.
    if route_column is not None:
        df = df.sort_values([route_column, schedule_col])
        group_keys = df[route_column].to_numpy()
    else:
        df = df.sort_values(schedule_col)
        group_keys = None
        if route_level is not None:
            group_keys = df.index.get_level_values(route_level).to_numpy()

    target = pd.to_numeric(df[target_col], errors='coerce').astype(float)

    # copy=True: the array is modified below and must not be a read-only view.
    if group_keys is None:
        previous = target.shift(1).to_numpy(dtype=float, copy=True)
        fill_values = np.full(len(df), target.median())
    else:
        grouped = target.groupby(group_keys)
        previous = grouped.shift(1).to_numpy(dtype=float, copy=True)
        fill_values = grouped.transform('median').to_numpy()

    gaps = np.isnan(previous)
    previous[gaps] = fill_values[gaps]
    # A median can itself be NaN (all-missing group): fall back to the default.
    previous[np.isnan(previous)] = default_fill

    df['PREV_DELAY'] = previous
    return df

# Confirm in the cell output that the helper was (re)defined in this kernel
_log('compute_prev_delay_safe defined')

In [None]:
# Section 5: Apply fixes to train_df & validate columns present

# Run compute_prev_delay_safe and compute_rolling_features_safe on df (the working copy).
# Failures are logged rather than raised; the assertions below then report which feature
# column is missing.

_log('Applying compute_prev_delay_safe...')
try:
    df = compute_prev_delay_safe(df)
    _log('PREV_DELAY stats:')
    _log(str(df['PREV_DELAY'].describe()))
except Exception as e:
    _log('compute_prev_delay_safe failed: ' + str(e))

_log('Applying compute_rolling_features_safe...')
try:
    df = compute_rolling_features_safe(df)
    _log('ROLLING_MEAN_DELAY_7D stats:')
    _log(str(df['ROLLING_MEAN_DELAY_7D'].describe()))
    _log('ROLLING_MEAN_DELAY_30D stats:')
    _log(str(df['ROLLING_MEAN_DELAY_30D'].describe()))
except Exception as e:
    _log('compute_rolling_features_safe failed: ' + str(e))

# sanity checks
assert 'PREV_DELAY' in df.columns, 'PREV_DELAY missing'
assert 'ROLLING_MEAN_DELAY_7D' in df.columns, 'ROLLING_MEAN_DELAY_7D missing'
assert 'ROLLING_MEAN_DELAY_30D' in df.columns, 'ROLLING_MEAN_DELAY_30D missing'

# Show only the columns that exist: with the empty fallback frame (or data without a
# TARGET column) selecting a missing column would raise KeyError and abort the cell.
sample_cols = [
    c for c in ['TARGET', 'PREV_DELAY', 'ROLLING_MEAN_DELAY_7D', 'ROLLING_MEAN_DELAY_30D']
    if c in df.columns
]

# sample display for routes (if present), else global head
rc = next((c for c in ['TRAIN_ID', 'STATION_ID', 'ROUTE_ID'] if c in df.columns), None)
if rc is not None:
    _log('show sample group for: ' + rc)
    display(df.loc[df[rc].notna(), sample_cols].head(8))
else:
    display(df[sample_cols].head(10))

_log('Application done')

In [None]:
# Section 7: Edge cases & fallback strategies for missing columns or index-based grouping
# Additional checks and safer conversions

# Case: route column present but due to dtype differences groupby raised an error. Cast route columns to str to avoid issues

def cast_route_to_string(df_in, candidates=None):
    """Return a copy of ``df_in`` with the route identifier columns cast to strings.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input frame. It is copied, never modified.
    candidates : list of str, optional
        Column names to cast when present. Defaults to the route identifier names used
        throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        Copy in which every present candidate column holds ``str`` values. Missing values
        stay missing: a plain ``astype(str)`` would turn them into the literal text
        ``'nan'`` / ``'None'``, which then forms a bogus group and hides the gap from
        ``isna`` / ``notna`` checks.
    """
    df = df_in.copy()
    if candidates is None:
        candidates = ['STATION_ID', 'ROUTE_ID', 'TRAIN_ID', 'TRAIN_NUMBER', 'ROUTE', 'STATION']
    for c in candidates:
        if c not in df.columns:
            continue
        try:
            column = df[c]
            # where() keeps the string for real values and NaN for originally-missing ones
            df[c] = column.astype(str).where(column.notna())
        except Exception as exc:
            # Best effort: leave the column as it was, but report the failure
            _log(f"could not cast {c} to str, leaving it unchanged: {exc}")
    return df

# Example to protect the pipeline: apply type casting when needed
try:
    df_safe = cast_route_to_string(df)
    _log('cast_route_to_string applied')
except Exception as e:
    # Keep df_safe defined: otherwise the MultiIndex check below raises NameError on a
    # fresh kernel, or silently reuses a stale df_safe from an earlier run.
    df_safe = df.copy()
    _log('casting failed: ' + str(e))

# If index is multi-level with non-string values: cast index levels to string for grouping convenience
if isinstance(df_safe.index, pd.MultiIndex):
    try:
        df_safe.index = df_safe.index.set_levels([lev.astype(str) for lev in df_safe.index.levels])
        _log('MultiIndex levels cast to str')
    except Exception as e:
        _log('Failed to cast MultiIndex levels: ' + str(e))

_log('Edge case strategies tested')

In [None]:
# Section 8: Integrate corrected helpers into pipeline and save results
# Demonstrates running the safe helpers on the main frame; they can also replace the
# existing helpers in the main notebook. To reuse them from other notebooks, move them
# into a utility module and import that module.

# Apply both helpers to train_df, rebinding train_df after each step
try:
    for helper in (compute_prev_delay_safe, compute_rolling_features_safe):
        train_df = helper(train_df)
    _log('train_df updated with safe prev_delay & rolling features')
except Exception as err:
    _log(f'Applying to global train_df failed: {err}')

# Final checks: list which engineered columns exist, then show a few rows
try:
    engineered = ['PREV_DELAY', 'ROLLING_MEAN_DELAY_7D', 'ROLLING_MEAN_DELAY_30D']
    present = [name for name in engineered if name in train_df.columns]
    print('Columns now include:', present)
    print('Sample:')
    sample = train_df[['TARGET','PREV_DELAY','ROLLING_MEAN_DELAY_7D','ROLLING_MEAN_DELAY_30D']]
    display(sample.head())
except Exception as err:
    _log(f'Cannot display final sample: {err}')

# Optional follow-up: persist these helpers in 'src/utils/feature_helpers.py' (this cell does not overwrite the original functions)
# To keep changes minimal, copy the functions into the utilities file, or import them here.