In [None]:
# Run the integration test suite through a shell escape; paths are relative to the kernel's working directory.
!pytest tests/integration

In [None]:
# !pip install -e .

In [None]:
from automl_tool.automl import AutoML
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import pandas as pd 
import numpy as np


In [None]:

# Classification demo: fit AutoML on the breast-cancer data after blanking
# out some training labels.

# Load the dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target, name='benign')

# Split the dataset first
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now add missing values directly to the training target
# (10 labels, drawn without replacement from the training index with a fixed seed)
np.random.seed(42)
missing_indices = np.random.choice(y_train.index, size=10, replace=False)
# NOTE(review): y_train comes out of train_test_split, so this is an in-place
# write on a derived Series and is not idempotent across re-runs of later cells.
y_train.loc[missing_indices] = None

# Initialize and fit the AutoML estimator
# (the third argument matches the name given to the target Series above)
automl = AutoML(X_train, y_train, "benign")
automl.fit_pipeline()

In [None]:
# Label counts in the training target; dropna=False keeps the injected missing labels visible.
y_train.value_counts(dropna=False)

In [None]:
# Display the object stored on the estimator by fit_pipeline().
automl.fitted_pipeline

In [None]:
# Compute importance scores, build the plot for the top 15 features, then
# display the stored figure as the cell output.
automl.get_feature_importance_scores()
automl.plot_feature_importance_scores(top_k=15)
automl.feature_importance_plot

## Time Series

In [None]:
## TODO -- add feature: parameter to vary forecast window

In [None]:
from automl_tool.preprocessing import ts_train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime, timedelta

# Location of the electricity-production CSV. The original author's absolute
# path is kept as the default so existing runs are unchanged; set the
# ELECTRIC_PRODUCTION_CSV environment variable to run the notebook elsewhere.
import os

ELECTRIC_CSV_PATH = os.environ.get(
    "ELECTRIC_PRODUCTION_CSV",
    "/Users/andrewcarr/andrew_carr_website/posts/auto_ml/input_data/electric_production.csv",
)
electric_tbl = pd.read_csv(ELECTRIC_CSV_PATH)

# NOTE(review): "opinionated_rc" is not a built-in matplotlib style; presumably
# it is registered by a separately installed package — confirm it is available.
plt.style.use("opinionated_rc")
plt.rcParams.update({'grid.linestyle': '-',})

# Convert date column to datetime format
electric_tbl['date'] = pd.to_datetime(electric_tbl['date'])

# Plot the data
plt.figure(figsize=(11, 4))
plt.plot(electric_tbl['date'], electric_tbl['electricity_production'],
 label='Electricity Production', color='blue', linewidth = 1)

# Add labels and title
plt.ylabel('Electricity Production', size = 9, loc = 'center')
plt.title('Electricity Production Over Time', size = 16)

# Set axis tick fontsize
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)

# Show the plot
plt.show()


In [None]:
# Time-series demo: build lagged features from the electricity table, fit
# AutoML in time-series mode and draw its backtest plots.
# NOTE: X, y, X_train and y_train rebind the names used by the classification
# cells above.

# Input dataframe and outcome variable (note - input dataframe includes outcome)
X, y = electric_tbl, electric_tbl["electricity_production"]

# Outcome variable and date names 
outcome_var, date_var = "electricity_production", "date"

# Feature derivation and holdout windows
fdw, holdout_window = 18, 24

# forecast_window=18 equals fdw here, so only the 18-step lag is generated by
# the range(forecast_window, fdw + 1) feature loops.
X_train, X_holdout, y_train, y_holdout = ts_train_test_split(X, y, outcome_var, date_var, fdw, holdout_window, forecast_window=18)

electric_automl_estimator = AutoML(X_train, y_train, "electricity_production", time_series=True)

electric_automl_estimator.fit_pipeline(holdout_window=holdout_window)

electric_automl_estimator.get_backtest_plots()

In [None]:
# Compare multiple forecast windows: 1, 6, 12, 18
# One AutoML model is trained per window; fitted models and holdout
# predictions are kept in dicts keyed by the window length.
forecast_windows = [1, 6, 12, 18]
colors = ['red', 'orange', 'green', 'blue']  # one line colour per window, same order
models = {}
predictions = {}

# NOTE: the enumerate index `i` is not used inside this loop.
for i, fw in enumerate(forecast_windows):
    # print(f"\n--- Training model for forecast window: {fw} months ---")
    
    # Split data with current forecast window
    # (fdw=18 and holdout_window=24 repeat the values used in the previous cell)
    X_train_fw, X_holdout_fw, y_train_fw, y_holdout_fw = ts_train_test_split(
        electric_tbl, electric_tbl["electricity_production"], 
        "electricity_production", "date", 
        fdw=18, holdout_window=24, forecast_window=fw
    )
    
    # Train AutoML model
    automl_fw = AutoML(X_train_fw, y_train_fw, "electricity_production", time_series=True)
    automl_fw.fit_pipeline(holdout_window=24)
    
    # Get predictions
    preds_fw = automl_fw.fitted_pipeline.best_estimator_.predict(X_holdout_fw)
    
    # Store results
    models[fw] = automl_fw
    predictions[fw] = {
        'X_holdout': X_holdout_fw,
        'y_holdout': y_holdout_fw,
        'predictions': preds_fw
    }

print("\nAll models trained successfully!")

# Create comparison plot
# Reads the `predictions`, `forecast_windows` and `colors` globals built by the
# training loop above.
fig, ax = plt.subplots(figsize=(20, 6))

# Plot actuals (use forecast window 1 as reference since they should all have same holdout period)
actual_values = predictions[1]['y_holdout'].to_numpy()
actual_index = predictions[1]['X_holdout'].index

ax.plot(actual_index, actual_values, label='Actual', color='black', linewidth=2)

# Plot predictions for each forecast window
for i, fw in enumerate(forecast_windows):
    pred_data = predictions[fw]
    ax.plot(pred_data['X_holdout'].index, pred_data['predictions'], 
            label=f'Predicted (FW={fw}m)', color=colors[i], 
            linestyle='--', alpha=0.8, linewidth=1.5)

# Styling
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Electricity Production', size=12, loc='center')
ax.set_title('Electricity Production: Forecast Window Comparison (Holdout Set)', size=18)

# Add legend
ax.legend(fontsize=12, loc='upper left', bbox_to_anchor=(0.02, 0.98))

# Add caption explaining forecast windows
# (placed at y=-0.05 in figure coordinates, i.e. just below the figure area)
fig.text(0.5, -0.05, 
         "Note: FW=Forecast Window. FW=1m means predictions made 1 month ahead, FW=18m means predictions made 18 months ahead.\nLonger forecast windows are more challenging and typically less accurate.", 
         wrap=True, horizontalalignment='center', fontsize=10)

# Grid
ax.grid(True, which='both', linestyle='-', linewidth=0.5, alpha=0.7)

# Tight layout
plt.tight_layout()
plt.show()

# Print summary statistics
# Mean absolute error on the holdout rows, computed by hand with NumPy for
# each forecast window stored in `predictions`.
print("\n=== Summary Statistics ===")
for fw in forecast_windows:
    pred_data = predictions[fw]
    mae = np.mean(np.abs(pred_data['predictions'] - pred_data['y_holdout'].to_numpy()))
    print(f"Forecast Window {fw}m - MAE: {mae:.2f}")

In [None]:
# Compute feature-importance scores for the electricity model fitted above.
electric_automl_estimator.get_feature_importance_scores()


In [None]:
# Build the importance plot with default arguments, then display the stored figure.
electric_automl_estimator.plot_feature_importance_scores()

electric_automl_estimator.feature_importance_plot

## Time Series Evaluation (Multiple Datasets)


In [None]:
# Install automl_tool in editable mode
!pip install -e .


In [None]:
import numpy as np
import pandas as pd
import warnings
from sklearn.metrics import mean_absolute_error
from automl_tool.automl import AutoML
# from automl_tool.preprocessing import ts_train_test_split
from pandas.tseries.frequencies import to_offset
from lightgbm import LGBMClassifier, LGBMRegressor

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and fitting warnings raised by the cells below.
warnings.filterwarnings("ignore")
# Seed NumPy's global RNG; the synth_* generators that call np.random.normal draw from it.
np.random.seed(42)

# Helpers
def make_dates(n, freq='ME', start='2000-01-01'):
    """Return ``n`` evenly spaced timestamps starting at ``start`` with step ``freq``."""
    return pd.date_range(start, periods=n, freq=freq)


def df_from_values(values, start='2000-01-01', freq='ME'):
    """Wrap a sequence of numbers in a two-column ``date`` / ``value`` frame.

    ``date`` holds one timestamp per value (see ``make_dates``) and ``value``
    holds the inputs converted to float.
    """
    stamps = make_dates(len(values), freq, start)
    numbers = np.asarray(values, dtype=float)
    return pd.DataFrame({'date': stamps, 'value': numbers})

def ensure_regular_frequency(df: pd.DataFrame, date_col: str = 'date', value_col: str = 'value') -> pd.DataFrame:
    """Return a copy of ``df`` re-indexed onto an evenly spaced date grid.

    Dates are parsed, sorted and de-duplicated. The spacing is taken from
    ``pd.infer_freq``; when that is unavailable the median gap between
    consecutive dates is used, falling back to daily. Grid points missing from
    the input are filled by time interpolation (or ffill/bfill if that fails).

    Args:
        df: Frame containing at least ``date_col`` and ``value_col``.
        date_col: Name of the date column.
        value_col: Name of the numeric column to regularise.

    Returns:
        A new frame with exactly the columns ``[date_col, value_col]`` and a
        fresh integer index. The input frame is not modified.
    """
    d = df.copy()
    d[date_col] = pd.to_datetime(d[date_col])
    # Rows whose date could not be parsed cannot be placed on the grid.
    d = d.dropna(subset=[date_col])
    d = d.sort_values(date_col).drop_duplicates(subset=[date_col])
    if d.empty:
        # Nothing to regularise; the .iloc[0] / .iloc[-1] lookups below would fail.
        return d[[date_col, value_col]].reset_index(drop=True)

    # pd.infer_freq raises ValueError when given fewer than 3 timestamps, so
    # short series go straight to the median-delta fallback.
    freq = pd.infer_freq(d[date_col]) if len(d) >= 3 else None
    if freq is None:
        deltas = d[date_col].diff().dropna()
        if len(deltas) == 0:
            freq = 'D'
        else:
            td = deltas.median()
            try:
                freq = to_offset(td)
            except Exception:
                freq = 'D'
    full_index = pd.date_range(d[date_col].iloc[0], d[date_col].iloc[-1], freq=freq)
    d = d.set_index(date_col).reindex(full_index)
    # Interpolate missing values over time; fallback to ffill then bfill
    if d[value_col].isna().any():
        try:
            d[value_col] = d[value_col].interpolate(method='time')
        except Exception:
            d[value_col] = d[value_col].ffill().bfill()
    # Name the index explicitly so the restored column is always ``date_col``.
    d = d.rename_axis(date_col).reset_index()
    return d[[date_col, value_col]]

# Synthetic dataset generators (diverse shapes)
def synth_seasonal(n=1000):
    """Period-12 sine wave plus a linear trend plus Gaussian noise (global NumPy RNG)."""
    steps = np.arange(n)
    wave = 10*np.sin(2*np.pi*steps/12)
    trend = 0.5*steps
    noise = np.random.normal(0, 2, n)
    return df_from_values(wave + trend + noise)

def synth_linear(n=1000):
    """Straight line with slope 0.3 plus Gaussian noise (global NumPy RNG)."""
    steps = np.arange(n)
    trend = 0.3*steps
    noise = np.random.normal(0, 3, n)
    return df_from_values(trend + noise)

def synth_quadratic(n=1000):
    """Parabola centred on the middle of the series plus Gaussian noise (global NumPy RNG)."""
    steps = np.arange(n)
    centred = steps-n/2
    curve = 0.01*centred**2
    noise = np.random.normal(0, 2, n)
    return df_from_values(curve + noise)

def synth_logistic(n=1000):
    """Logistic S-curve rising from 0 to 100 around the midpoint, plus Gaussian noise (global NumPy RNG)."""
    steps = np.arange(n)
    centre = n/2
    exponent = -(steps-centre)/10
    curve = 100/(1+np.exp(exponent))
    noise = np.random.normal(0, 2, n)
    return df_from_values(curve + noise)

def synth_walk(n=1000):
    """Random walk with drift: cumulative sum of N(0.3, 1.0) increments from a generator seeded with 123."""
    generator = np.random.default_rng(123)
    increments = generator.normal(0.3, 1.0, n)
    return df_from_values(np.cumsum(increments))

def synth_piecewise(n=1000):
    """Piecewise-linear series with two changepoints plus Gaussian noise.

    The slope is 0.2 up to 30% of the series, -0.1 up to 65%, then 0.4; the
    segments join continuously. Noise is drawn from the global NumPy RNG.

    Args:
        n: Number of points to generate.

    Returns:
        The frame built by ``df_from_values`` from the generated values.
    """
    # np.piecewise returns an array with the same dtype as its input. With the
    # integer output of a plain np.arange(n) the fractional slopes below were
    # truncated to whole numbers, so the time axis must be float.
    t = np.arange(n, dtype=float)
    k1 = int(n*0.3); k2 = int(n*0.65)
    base = np.piecewise(t,
                        [t < k1, (t >= k1) & (t < k2), t >= k2],
                        [lambda x: 0.2*x,
                         lambda x: (0.2*k1) + (-0.1)*(x-k1),
                         lambda x: (0.2*k1) + (-0.1)*(k2-k1) + 0.4*(x-k2)])
    y = base + np.random.normal(0, 2, n)
    return df_from_values(y)

def synth_spiky(n=1000):
    """Noisy period-24 sine around 20 with occasional large upward spikes (generator seeded with 0)."""
    generator = np.random.default_rng(0)
    steps = np.arange(n)
    series = 20 + np.sin(2*np.pi*steps/24) + generator.normal(0, 2, n)
    # Spike count: at least 10, otherwise roughly one in thirty points.
    n_spikes = max(10, n//30)
    spike_at = generator.choice(n, size=n_spikes, replace=False)
    series[spike_at] += generator.uniform(10, 25, len(spike_at))
    return df_from_values(series)

def synth_multiseason(n=1000):
    """Sum of a period-12 and a period-6 sine wave plus Gaussian noise (global NumPy RNG)."""
    steps = np.arange(n)
    slow = 5*np.sin(2*np.pi*steps/12)
    fast = 3*np.sin(2*np.pi*steps/6)
    noise = np.random.normal(0, 2, n)
    return df_from_values(slow + fast + noise)

# Collect datasets (n=1000 each synthetic)
# Maps a display name to a date/value frame. Entries are evaluated top to
# bottom, and the generators that use np.random.normal draw from the global
# RNG, so reordering entries changes the generated series.
datasets = {
    'Seasonal+Trend': synth_seasonal(n=1000),
    'Linear Trend': synth_linear(n=1000),
    'Quadratic': synth_quadratic(n=1000),
    'Logistic (S-curve)': synth_logistic(n=1000),
    'Random Walk (drift)': synth_walk(n=1000),
    'Piecewise (changepoints)': synth_piecewise(n=1000),
    'Spiky Intermittent': synth_spiky(n=1000),
    'Multi-seasonal': synth_multiseason(n=1000),
}

# Add real datasets from statsmodels.
# statsmodels is optional and each dataset is best-effort: a failure skips that
# dataset, but the reason is printed (warnings are globally silenced in this
# notebook, so print is used instead of warnings.warn).
try:
    import statsmodels.api as sm
except Exception as exc:
    print(f"Skipping statsmodels datasets: {exc!r}")
else:
    # Sunspots (yearly)
    try:
        sun = sm.datasets.sunspots.load_pandas().data
        df_sun = pd.DataFrame({
            # NOTE(review): YEAR presumably arrives as float (e.g. 1700.0), whose
            # text form does not match '%Y'; converting via int -> str parses
            # either integer or float years.
            'date': pd.to_datetime(sun['YEAR'].astype(int).astype(str), format='%Y', errors='coerce'),
            'value': sun['SUNACTIVITY'].astype(float)
        }).dropna()
        if df_sun.empty:
            # An empty frame would only fail later, during normalisation.
            raise ValueError("no rows left after date parsing")
        datasets['Sunspots'] = df_sun
    except Exception as exc:
        print(f"Skipping Sunspots: {exc!r}")

    # Mauna Loa CO2 (weekly)
    try:
        co2 = sm.datasets.co2.load_pandas().data
        co2 = co2.copy()
        # Normalise to 'date' / 'value' columns whichever way the frame is laid out.
        if 'co2' in co2.columns and 'date' not in co2.columns:
            co2 = co2.reset_index().rename(columns={'index': 'date', 'co2': 'value'})
        else:
            if 'date' not in co2.columns:
                co2 = co2.reset_index().rename(columns={'index': 'date'})
            if 'value' not in co2.columns:
                first_val = [c for c in co2.columns if c != 'date'][0]
                co2 = co2.rename(columns={first_val: 'value'})
        co2 = co2[['date', 'value']].dropna()
        datasets['CO2'] = co2
    except Exception as exc:
        print(f"Skipping CO2: {exc!r}")

# Add selected FRED series.
# Requires pandas_datareader and network access; both are optional. A failure
# skips the affected series but prints the reason instead of hiding it.
try:
    from pandas_datareader import data as pdr
except Exception as exc:
    print(f"Skipping FRED series: {exc!r}")
else:
    fred_codes = ['CPIAUCSL', 'UNRATE', 'INDPRO']
    for code in fred_codes:
        try:
            df_fred = pdr.DataReader(code, 'fred', start='1990-01-01')
            # The series column is named after its code and the dates sit in a 'DATE' index.
            df_fred = df_fred.rename(columns={code: 'value'}).reset_index().rename(columns={'DATE': 'date'})
            df_fred = df_fred.dropna()
            datasets[code] = df_fred
        except Exception as exc:
            print(f"Skipping FRED series {code}: {exc!r}")
            continue

# Filter to only the datasets the user wants
# Every key added by the cells above is currently listed here, so this is a
# no-op until a name is removed from the set.
allowed = {
    'Seasonal+Trend', 'Linear Trend', 'Quadratic', 'Logistic (S-curve)',
    'Random Walk (drift)', 'Piecewise (changepoints)', 'Spiky Intermittent',
    'Multi-seasonal', 'Sunspots', 'CO2', 'CPIAUCSL', 'UNRATE', 'INDPRO'
}
datasets = {k: v for k, v in datasets.items() if k in allowed}

# Evaluate AutoML on every collected dataset: normalise the dates, split, fit,
# then report the holdout MAE per dataset and the average across datasets.
# NOTE(review): ts_train_test_split is not imported in this cell (the import
# above is commented out); it resolves to whichever definition was last bound
# in the kernel.
winners = []
# Normalize to equidistant dates
for k in list(datasets.keys()):
    datasets[k] = ensure_regular_frequency(datasets[k], 'date', 'value')

# These rebind the notebook-level fdw / holdout_window used by earlier cells.
fdw, holdout_window, forecast_window = 18, 24, 1
maes = []

for name, df in datasets.items():
    # Too few rows to build the lag features and still leave a holdout set.
    if len(df) < fdw + holdout_window + forecast_window + 1:
        continue
    X, y = df, df['value']
    X_train, X_holdout, y_train, y_holdout = ts_train_test_split(
        X, y, 'value', 'date', fdw, holdout_window, forecast_window=forecast_window
)
    # Skip datasets that are too short for 5-fold TimeSeriesSplit with this test size
    if len(X_train) <= 5 * holdout_window:
        continue
    automl_mod = AutoML(X_train, y_train, 'value', time_series=True)
    automl_mod.fit_pipeline(holdout_window=holdout_window)
    preds = automl_mod.fitted_pipeline.best_estimator_.predict(X_holdout)
    mae = mean_absolute_error(y_holdout, preds)
    maes.append(mae)
    print(f"{name}: {mae:.3f}")
    # Record the class name of the object stored under the best estimator's 'model' parameter.
    winners.append(type(automl_mod.fitted_pipeline.best_estimator_.get_params()['model']).__name__)

if maes:
    print(f"Average MAE: {float(np.mean(maes)):.3f}")

### XGB forecasting block

In [None]:
import pandas as pd 
from typing import Optional, Tuple

def ts_train_test_split(
    X: pd.DataFrame, 
    y: pd.Series, 
    outcome_col: str, 
    date_col: str, 
    fdw: int, 
    holdout_window: int,
    forecast_window: Optional[int] = 1 
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """
    Build lag features and split a time series into training and holdout sets.

    The last ``holdout_window`` rows form the holdout set. The holdout slice is
    started ``fdw`` rows earlier so that, once rows with incomplete lag history
    are dropped, exactly ``holdout_window`` rows remain.

    Every feature for a row at time ``t`` uses only observations at
    ``t - forecast_window`` or earlier, i.e. what is known when a forecast is
    made ``forecast_window`` steps ahead. For each ``i`` in
    ``forecast_window..fdw`` the function produces the lag-``i`` value, its
    inverse hyperbolic sine, and the rolling mean and minimum over lags
    ``forecast_window..i``. Two cyclical month features are added as well.

    Args:
        X: Input frame containing ``date_col`` and ``outcome_col``. Not modified.
        y: Outcome series aligned row-for-row with ``X``. Not modified.
        outcome_col: Name of the outcome column in ``X``.
        date_col: Name of the date column in ``X``; becomes the output index.
        fdw: Feature derivation window (largest lag used).
        holdout_window: Number of trailing rows reserved for the holdout set.
        forecast_window: Forecast horizon in rows; ``None`` is treated as 1.

    Returns:
        ``(X_train, X_test, y_train, y_test)``, indexed by date, with the
        original date and outcome columns removed from the feature frames.

    Raises:
        ValueError: If ``forecast_window`` is outside ``1..fdw`` or ``X`` has
            fewer than ``fdw + holdout_window`` rows.
    """
    horizon = 1 if forecast_window is None else int(forecast_window)
    if horizon < 1:
        raise ValueError(f"forecast_window must be >= 1, got {forecast_window!r}")
    if horizon > fdw:
        # range(horizon, fdw + 1) would be empty and no lag feature would be built.
        raise ValueError(f"forecast_window ({horizon}) cannot exceed fdw ({fdw})")
    if X.shape[0] < fdw + holdout_window:
        # A negative start index would silently slice from the wrong end.
        raise ValueError(
            f"need at least fdw + holdout_window = {fdw + holdout_window} rows, got {X.shape[0]}"
        )

    # Helper function to preprocess ts data
    def _ts_preproc(inp_tbl: pd.DataFrame, inp_y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
        outcome = inp_tbl[outcome_col]
        # Most recent observation available when forecasting `horizon` steps ahead.
        # The rolling statistics previously used shift(1) for every horizon, which
        # leaked observations newer than t - horizon whenever horizon > 1.
        latest_known = outcome.shift(horizon)
        month = pd.to_datetime(inp_tbl[date_col]).dt.month
        lags = range(horizon, fdw + 1)

        features = {}
        features.update({f"lagged_{outcome_col}_{i}m": outcome.shift(i) for i in lags})
        features.update({f"inv_hyp_sin_lagged_{outcome_col}_{i}m": np.arcsinh(outcome.shift(i)) for i in lags})
        # A window of (i - horizon + 1) rows ending at lag `horizon` covers lags
        # horizon..i; for horizon == 1 this equals shift(1).rolling(i).
        features.update({f"rolling_avg_{outcome_col}_{i}m": latest_known.rolling(window=i - horizon + 1).mean() for i in lags})
        features.update({f"min_{outcome_col}_{i}m": latest_known.rolling(window=i - horizon + 1).min() for i in lags})
        # Cyclical encoding of the calendar month
        features["monthsin"] = np.sin(2 * np.pi * month / 12.0)
        features["monthcos"] = np.cos(2 * np.pi * month / 12.0)

        preproc_tbl = (inp_tbl
        .assign(**features)
        # Drop the original date and outcome columns
        .drop([date_col, outcome_col], axis=1)
        # Rowwise deletion of missing values
        .dropna(axis=0)
        )
        preproc_y = inp_y.loc[preproc_tbl.index]

        return preproc_tbl, preproc_y

    # Work on re-indexed copies so the caller's objects are left untouched
    X = X.reset_index(drop=True)
    y = y.reset_index(drop=True)

    # Calculate the index to split the data
    train_end_index = X.shape[0] - (holdout_window)
    test_start_index = X.shape[0] - (fdw + holdout_window)

    # Split the data and index both X and y train/test by the date column
    X_train = X.iloc[:train_end_index].set_index(date_col, drop=False)
    X_test = X.iloc[test_start_index:].set_index(date_col, drop=False)
    y_train = y.iloc[:train_end_index].copy()
    y_train.index = X_train.index
    y_test = y.iloc[test_start_index:].copy()
    y_test.index = X_test.index

    # Preprocess the data
    X_train, y_train = _ts_preproc(X_train, y_train)
    X_test, y_test = _ts_preproc(X_test, y_test)

    return X_train, X_test, y_train, y_test


In [None]:
from xgboost import XGBRegressor
from automl_tool.estimation import XGBWithEarlyStoppingRegressor
from pandas_datareader import data as pdr
from automl_tool.preprocessing import ts_train_test_split
from sklearn.metrics import mean_absolute_error
import numpy as np 

# Baseline: fit XGBWithEarlyStoppingRegressor with default arguments on the
# CPIAUCSL series from FRED and report the holdout MAE.
# NOTE: this cell imports ts_train_test_split from automl_tool.preprocessing,
# which replaces the notebook-local definition from the previous cell.
fdw, holdout_window, forecast_window = 12, 24, 1

# Network call: downloads the series from FRED, starting in 1990.
df_fred = pdr.DataReader("CPIAUCSL", 'fred', start='1990-01-01')
df_fred = df_fred.rename(columns={'CPIAUCSL': 'value'}).reset_index().rename(columns={'DATE': 'date'})
df_fred = df_fred.dropna()

X, y = df_fred, df_fred['value']
X_train, X_holdout, y_train, y_holdout = ts_train_test_split(
	X, y, 'value', 'date', fdw, holdout_window, forecast_window=forecast_window
)

xgb_model = XGBWithEarlyStoppingRegressor()
xgb_model.fit(X_train, y_train)

preds = xgb_model.predict(X_holdout)
mae = mean_absolute_error(y_holdout, preds)
# NOTE(review): the label says "XGBRegressor" but the fitted object is an XGBWithEarlyStoppingRegressor.
print(f"XGBRegressor MAE on CPIAUCSL: {mae:.3f}")


In [None]:
# Display the fitted baseline estimator.
xgb_model

### AutoML forecasting block

In [None]:
from automl_tool.automl import AutoML

# Fit AutoML in time-series mode on the X_train / y_train split created in the
# XGB cell above, then score it on the same holdout rows.
automl_mod = AutoML(X_train, y_train, 'value', time_series=True)

automl_mod.fit_pipeline(holdout_window=holdout_window)

# Predict the holdout set with the fitted pipeline
# (other cells call .best_estimator_.predict; here predict is called on fitted_pipeline itself)
y_preds = automl_mod.fitted_pipeline.predict(X_holdout)

# Holdout MAE, shown as the cell output
mean_absolute_error(y_holdout, y_preds)


In [None]:

# Plot actual and predicted values on the holdout set
# Uses y_holdout and y_preds from the AutoML cell above; both share y_holdout's date index.
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
plt.plot(y_holdout.index, y_holdout, label='Actual', color='blue')
plt.plot(y_holdout.index, y_preds, label='Predicted', color='orange')
legend = plt.legend(loc='upper left', fontsize=12)
plt.title('Predictions vs Actuals on Holdout Set')


In [None]:
# Display the fitted pipeline object for the CPIAUCSL model.
automl_mod.fitted_pipeline

### Scalecast forecasting block


In [None]:

import pandas as pd
from scalecast.Forecaster import Forecaster
from scalecast import GridGenerator

# Scalecast comparison: tune and fit an xgboost forecaster on the same
# CPIAUCSL frame (df_fred) with a 24-row test set.
GridGenerator.get_example_grids()  # example hyperparameter grids

# NOTE: `data` was bound to the scikit-learn dataset object earlier in the notebook; it is rebound here.
data = df_fred
f = Forecaster(
    y=data['value'],               # required
    current_dates=data['date'],    # required
    future_dates=1,               # length of the forecast horizon
    test_length=24,                 # set a test set length or fraction to validate all models if desired
    cis=False,                     # choose whether or not to evaluate confidence intervals for all models
)
f.set_estimator('xgboost')  # select an estimator

f.auto_Xvar_select()       # find best look-back, trend, and seasonality for your series
f.cross_validate(k=3)       # tune model hyperparams using time series cross validation
f.auto_forecast()           # automatically forecast with the chosen Xvars and hyperparams

# Exported results; 'model_summaries' is read again in the last cell of the notebook.
results = f.export(['lvl_fcsts','model_summaries'])


In [None]:
# Test-set predictions exported from the scalecast forecaster, then the MAE
# between its 'actual' and 'xgboost' columns.
# Relies on mean_absolute_error imported in an earlier cell.
ts_preds = f.export('lvl_test_set_predictions')

mean_absolute_error(ts_preds['actual'], ts_preds['xgboost'])

In [None]:
# Plot scalecast's test-set predictions against the actuals.
# NOTE(review): the x-axis reuses y_holdout.index from the AutoML split; this
# assumes scalecast's 24 test rows cover the same dates — confirm.
plt.figure(figsize=(12, 6))
plt.plot(y_holdout.index, ts_preds['actual'], label='Actual', color='blue')
plt.plot(y_holdout.index, ts_preds['xgboost'], label='Predicted', color='orange')
legend = plt.legend(loc='upper left', fontsize=12)
# The plotted predictions come from scalecast's 'xgboost' estimator, not from
# XGBWithEarlyStoppingRegressor as the previous title stated.
plt.title('Scalecast XGBoost Predictions vs Actuals on Test Set')

In [None]:
# Inspect the example xgboost hyperparameter grid that scalecast generates.
from scalecast import GridGenerator
GridGenerator.get_example_grids()   # writes Grids.py to your working dir (if not already present)

# then either open Grids.py in your editor, or import it:
# (the import only resolves if the working directory is on sys.path)
from Grids import xgboost as xgb_grid
print(xgb_grid)


In [None]:
# 'HyperParams' entry of the first row of the exported model summaries.
results['model_summaries']['HyperParams'][0]