

**Purpose:** This notebook uses machine learning to forecast the future performance of a stock to be used as features in a downstream classifier


**Description:** Code will read in the base data file and create a series of forecasts, pick the best ones and use as features

**Prerequisites:**
- Environment: kedro-test (Python 3.10.16)
- Required data: Base machine learning dataset for modeling
- Key dependencies: Pandas, Sklearn

**Inputs:** see above


**Outputs:** [Generated files and artifacts]

**Usage:**
1. Ensure prerequisites are met
2. Verify input data availability
3. Run cells sequentially
4. Check outputs in specified directories

**⚠️ Notes:** [Any important warnings or considerations]

---


In [51]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Optional
from statsmodels.tsa.arima.model import ARIMA

In [None]:
# functions:

def make_sliding_window_split_df(df: pd.DataFrame, group_col: str, time_col: str, window_size: int, horizon: int = 1, step: int = 1, max_splits: Optional[int] = None
) -> pd.DataFrame:
    """
    Generate sliding window train/test splits for time series data grouped by a key column.

    Args:
        df (pd.DataFrame): Original dataframe containing the time series data.
        group_col (str): Name of the column to group by (e.g., ticker symbol).
        time_col (str): Name of the datetime column used for sorting within groups.
        window_size (int): Number of consecutive days used for the training window.
        horizon (int, optional): Number of days to forecast in the test window. Defaults to 1.
        step (int, optional): Number of days to move the sliding window forward each iteration. Defaults to 1.
        max_splits (int, optional): Maximum number of splits to generate per group. Defaults to None (no limit).

    Returns:
        pd.DataFrame: A DataFrame where each row represents a train/test split with columns:
            - 'ticker': Group value (e.g., stock ticker).
            - 'split_id': Integer split index within each group.
            - 'train_start': Start date of training window.
            - 'train_end': End date of training window.
            - 'test_start': Start date of testing window.
            - 'test_end': End date of testing window.
            - 'train_idx': List of original dataframe indices used for training.
            - 'test_idx': List of original dataframe indices used for testing.
            - 'horizon': Number of days forecasted (test window size).
            - 'window_size': Number of days in training window.
    """
    df = df.copy()
    df = df.sort_values([group_col, time_col]).reset_index(drop=False)
    full_idx = df['index'].values  # preserve original indices

    records = []

    for group_val, group_df in df.groupby(group_col):
        group_df = group_df.reset_index(drop=True)
        group_indices = group_df.index.to_numpy()
        n_samples = len(group_df)
        split_count = 0

        for start in range(0, n_samples - window_size - horizon + 1, step):
            if max_splits is not None and split_count >= max_splits:
                break

            train_idx = group_indices[start : start + window_size]
            test_idx = group_indices[start + window_size : start + window_size + horizon]

            records.append({
                'ticker': group_val,
                'split_id': split_count,
                'train_start': group_df.loc[train_idx[0], time_col],
                'train_end': group_df.loc[train_idx[-1], time_col],
                'test_start': group_df.loc[test_idx[0], time_col],
                'test_end': group_df.loc[test_idx[-1], time_col],
                'train_idx': full_idx[train_idx].tolist(),
                'test_idx': full_idx[test_idx].tolist(),
                'horizon': horizon,
                'window_size': window_size
            })

            split_count += 1

    return pd.DataFrame(records)



In [None]:
# parameters:
parameters = {
    'group_col' : 'ticker',
    'time_col' : 'date', 
    'training_days' : 150,
    'forecast_horizon' : 30,
    'step' : 1,
    'max_windows_per_group' : None,
    'with_replacement' : True,
    'target_variable'  : 'adj_close',
    # specify the time series arguments to be used:
    'arima_order' : [1, 1, 2]
    
    }  


In [38]:
# import data:

df = pd.read_csv('../data/03_primary/combined_modeling_input.csv')

# Update data types:
# make sure date types are correct:
df['date'] = pd.to_datetime(df['date'])


In [39]:
df.head()

Unnamed: 0,date,open,high,low,close,adj_close,volume,ticker,7_close_sma,14_close_sma,...,cum_days_above_above_14_close_sma_ind,cum_days_above_above_21_close_sma_ind,upper_bollinger_band,lower_bollinger_band,bol_pct_from_top,bol_pct_from_bottom,bol_range,bol_range_pct,target_20_days_ahead,target_20_days_ahead_ind
0,2019-01-02,38.7225,39.712502,38.557499,39.48,37.75008,148158800,AAPL,,,...,0.0,0.0,,,,,,,41.610001,1
1,2019-01-03,35.994999,36.43,35.5,35.547501,33.989902,365248800,AAPL,,,...,0.0,0.0,,,,,,,41.630001,1
2,2019-01-04,36.1325,37.137501,35.950001,37.064999,35.440895,234428400,AAPL,,,...,0.0,0.0,,,,,,,42.8125,1
3,2019-01-07,37.174999,37.2075,36.474998,36.982498,35.362019,219111200,AAPL,,,...,0.0,0.0,,,,,,,43.544998,1
4,2019-01-08,37.389999,37.955002,37.130001,37.6875,36.036129,164101200,AAPL,,,...,0.0,0.0,,,,,,,43.560001,1


In [41]:

# define the training and test split windows:


df_split = make_sliding_window_split_df(df = df, group_col = 'ticker', time_col = 'date', window_size = 150, horizon = 30, step = 1, max_splits = None)

# check the split:
df_split.head()

# check the split:
df_split.tail()



Unnamed: 0,ticker,split_id,train_start,train_end,test_start,test_end,train_idx,test_idx,horizon,window_size
2227,XLF,739,2021-12-07,2022-07-13,2022-07-14,2022-08-24,"[739, 740, 741, 742, 743, 744, 745, 746, 747, ...","[889, 890, 891, 892, 893, 894, 895, 896, 897, ...",30,150
2228,XLF,740,2021-12-08,2022-07-14,2022-07-15,2022-08-25,"[740, 741, 742, 743, 744, 745, 746, 747, 748, ...","[890, 891, 892, 893, 894, 895, 896, 897, 898, ...",30,150
2229,XLF,741,2021-12-09,2022-07-15,2022-07-18,2022-08-26,"[741, 742, 743, 744, 745, 746, 747, 748, 749, ...","[891, 892, 893, 894, 895, 896, 897, 898, 899, ...",30,150
2230,XLF,742,2021-12-10,2022-07-18,2022-07-19,2022-08-29,"[742, 743, 744, 745, 746, 747, 748, 749, 750, ...","[892, 893, 894, 895, 896, 897, 898, 899, 900, ...",30,150
2231,XLF,743,2021-12-13,2022-07-19,2022-07-20,2022-08-30,"[743, 744, 745, 746, 747, 748, 749, 750, 751, ...","[893, 894, 895, 896, 897, 898, 899, 900, 901, ...",30,150


In [90]:
def train_time_series_models(df: pd.DataFrame, splits: pd.DataFrame, parameters: dict) -> pd.DataFrame:

    ''' Modeling for the time series features to go into the model; we will start with a simple ARIMA for this

    Args:
        df: Dataframe containing the time series data
        splits: dataframe containing the splits for the time series data
        parameters: dictionary containing the parameters for the model

    Returns:
        dataframe containing the model results

    '''
    #TODO - add in autoa arima capability for the FE step

    split_test = splits[0:1]
    print(split_test)

    iterations = 0

    for index, split in split_test.iterrows():
        # set variables for the split:
        ticker = split['ticker']
        train_start = pd.to_datetime(split['train_start'])
        train_end = pd.to_datetime(split['train_end'])

        # create a training set for each split:
        training_set = df[(df['ticker'] == ticker) &
                      (df['date'] >= train_start) & 
                      (df['date'] <= train_end)]
        # set the training time series for the ARIMA:
        train_ts = training_set.set_index('date')['adj_close']

        # create a test set for each split:
        training_set = df[(df['ticker'] == ticker) &
                      (df['date'] >= train_start) & 
                      (df['date'] <= train_end)]
        # set the test time series for the ARIMA:
        train_ts = training_set.set_index('date')['adj_close']




        # create a test set for each split:

        #test_set = df[df['ticker'] == split['ticker'] & df['date'] >= split['test_start'] & df['date'] <= split['test_end']]

        # predict the future for the test set:
        #model = ARIMA(training_set[parameters['target_variable']], order=parameters['arima_order'])
       
        #predictions = model_fit.predict(start=training_set['train_start'], end=training_set['train_end'], dynamic=False)

        # add the predictions to the test set:
        

    
    return training_set

    


In [91]:
train_time_series_models(df = df, splits = df_split, parameters = parameters)

  ticker  split_id train_start  train_end test_start   test_end  \
0   AAPL         0  2019-01-02 2019-08-06 2019-08-07 2019-09-18   

                                           train_idx  \
0  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...   

                                            test_idx  horizon  window_size  
0  [150, 151, 152, 153, 154, 155, 156, 157, 158, ...       30          150  


Unnamed: 0,date,open,high,low,close,adj_close,volume,ticker,7_close_sma,14_close_sma,...,cum_days_above_above_14_close_sma_ind,cum_days_above_above_21_close_sma_ind,upper_bollinger_band,lower_bollinger_band,bol_pct_from_top,bol_pct_from_bottom,bol_range,bol_range_pct,target_20_days_ahead,target_20_days_ahead_ind
0,2019-01-02,38.722500,39.712502,38.557499,39.480000,37.750080,148158800,AAPL,,,...,0.0,0.0,,,,,,,41.610001,1
1,2019-01-03,35.994999,36.430000,35.500000,35.547501,33.989902,365248800,AAPL,,,...,0.0,0.0,,,,,,,41.630001,1
2,2019-01-04,36.132500,37.137501,35.950001,37.064999,35.440895,234428400,AAPL,,,...,0.0,0.0,,,,,,,42.812500,1
3,2019-01-07,37.174999,37.207500,36.474998,36.982498,35.362019,219111200,AAPL,,,...,0.0,0.0,,,,,,,43.544998,1
4,2019-01-08,37.389999,37.955002,37.130001,37.687500,36.036129,164101200,AAPL,,,...,0.0,0.0,,,,,,,43.560001,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,2019-07-31,54.105000,55.342499,52.825001,53.259998,51.341656,277125600,AAPL,52.277500,51.707143,...,8.0,39.0,52.980339,49.715137,0.005279,0.071303,3.265203,0.061307,51.382500,0
146,2019-08-01,53.474998,54.507500,51.685001,52.107498,50.230667,216071600,AAPL,52.262857,51.798750,...,9.0,40.0,53.050522,49.780668,-0.017776,0.046742,3.269854,0.062752,52.252499,1
147,2019-08-02,51.382500,51.607498,50.407501,51.005001,49.167881,163448400,AAPL,52.096785,51.777500,...,0.0,0.0,53.050162,49.771743,-0.038551,0.024778,3.278419,0.064276,52.185001,1
148,2019-08-05,49.497501,49.662498,48.145000,48.334999,46.594040,209572000,AAPL,51.608214,51.578214,...,0.0,0.0,53.398791,49.163828,-0.094830,-0.016859,4.234963,0.087617,51.424999,1


In [85]:

df.head()


#train_time_series_models(df = df, splits = df_split, parameters = parameters)

Unnamed: 0,date,open,high,low,close,adj_close,volume,ticker,7_close_sma,14_close_sma,...,cum_days_above_above_14_close_sma_ind,cum_days_above_above_21_close_sma_ind,upper_bollinger_band,lower_bollinger_band,bol_pct_from_top,bol_pct_from_bottom,bol_range,bol_range_pct,target_20_days_ahead,target_20_days_ahead_ind
0,2019-01-02,38.7225,39.712502,38.557499,39.48,37.75008,148158800,AAPL,,,...,0.0,0.0,,,,,,,41.610001,1
1,2019-01-03,35.994999,36.43,35.5,35.547501,33.989902,365248800,AAPL,,,...,0.0,0.0,,,,,,,41.630001,1
2,2019-01-04,36.1325,37.137501,35.950001,37.064999,35.440895,234428400,AAPL,,,...,0.0,0.0,,,,,,,42.8125,1
3,2019-01-07,37.174999,37.2075,36.474998,36.982498,35.362019,219111200,AAPL,,,...,0.0,0.0,,,,,,,43.544998,1
4,2019-01-08,37.389999,37.955002,37.130001,37.6875,36.036129,164101200,AAPL,,,...,0.0,0.0,,,,,,,43.560001,1


In [None]:
# create baseline forecaster:

def create_baseline_forecaster(df: pd.DataFrame, parameters: dict) -> pd.DataFrame:

In [None]:
# Updated training function with ARIMA PDQ selection
def train_time_series_models_with_arima(df: pd.DataFrame, splits: pd.DataFrame, parameters: dict) -> pd.DataFrame:
    """
    Train ARIMA models with automatic PDQ selection
    
    Args:
        df: Dataframe containing the time series data
        splits: dataframe containing the splits for the time series data
        parameters: dictionary containing the parameters for the model
    
    Returns:
        dataframe containing the model results
    """
    
    results = []
    
    # Test on first few splits
    split_test = splits.head(5)  # Start with 5 splits for testing
    
    for index, split in split_test.iterrows():
        try:
            # Get training data using the indices
            train_indices = split['train_idx']
            train_data = df.iloc[train_indices]
            
            # Focus on close price for ARIMA
            price_series = train_data['close']
            
            # 1. Determine differencing order
            d_order = determine_differencing_order(price_series)
            print(f"Ticker: {split['ticker']}, Split: {split['split_id']}, Suggested D: {d_order}")
            
            # 2. Create focused PDQ grid around determined D
            focused_pdq = []
            for p in [0, 1, 2]:
                for q in [0, 1, 2]:
                    focused_pdq.append((p, d_order, q))
            
            # 3. Select best PDQ
            best_pdq, best_model, best_aic = select_best_arima_pdq(price_series, focused_pdq)
            
            if best_pdq is not None and best_model is not None:
                # 4. Make predictions
                forecast_steps = split['horizon']
                forecast = best_model.forecast(steps=forecast_steps)
                
                # Store results
                results.append({
                    'ticker': split['ticker'],
                    'split_id': split['split_id'],
                    'best_pdq': best_pdq,
                    'aic': best_aic,
                    'forecast_mean': forecast.mean(),
                    'forecast_std': forecast.std(),
                    'train_start': split['train_start'],
                    'train_end': split['train_end'],
                    'test_start': split['test_start'],
                    'test_end': split['test_end']
                })
                
                print(f"Best PDQ: {best_pdq}, AIC: {best_aic:.2f}")
            
        except Exception as e:
            print(f"Error processing split {split['split_id']}: {str(e)}")
            continue
    
    return pd.DataFrame(results)


In [None]:
# Test the ARIMA PDQ selection
print("Testing ARIMA PDQ Selection...")
print("="*50)

# Install required package if not already installed
try:
    from statsmodels.tsa.arima.model import ARIMA
    from statsmodels.tsa.stattools import adfuller
    print("✓ Statsmodels is available")
except ImportError:
    print("❌ Please install statsmodels: pip install statsmodels")

# Run the ARIMA training
try:
    arima_results = train_time_series_models_with_arima(df=df, splits=df_split, parameters=parameters)
    
    # Display results
    if not arima_results.empty:
        print("\nARIMA Results Summary:")
        print(arima_results[['ticker', 'split_id', 'best_pdq', 'aic', 'forecast_mean']].head(10))
    else:
        print("No results generated. Check for errors above.")
        
except Exception as e:
    print(f"Error running ARIMA analysis: {e}")
    print("You may need to install statsmodels: pip install statsmodels")


In [None]:
# AUTO ARIMA - Much Better Approach!
# Install: pip install pmdarima

def train_auto_arima_models(df: pd.DataFrame, splits: pd.DataFrame, parameters: dict) -> pd.DataFrame:
    """
    Train ARIMA models using auto_arima for automatic PDQ optimization
    
    Args:
        df: Dataframe containing the time series data
        splits: dataframe containing the splits for the time series data  
        parameters: dictionary containing the parameters for the model
    
    Returns:
        dataframe containing the model results
    """
    
    try:
        from pmdarima import auto_arima
        print("✓ pmdarima is available")
    except ImportError:
        print("❌ Please install pmdarima: pip install pmdarima")
        return pd.DataFrame()
    
    results = []
    
    # Test on first few splits
    split_test = splits.head(10)  # Test with 10 splits
    
    for index, split in split_test.iterrows():
        try:
            # Get training data using the indices
            train_indices = split['train_idx']
            train_data = df.iloc[train_indices]
            
            # Focus on target variable for ARIMA
            target_col = parameters.get('target_variable', 'adj_close')
            price_series = train_data[target_col].dropna()
            
            print(f"\\nProcessing: {split['ticker']}, Split: {split['split_id']}")
            
            # AUTO ARIMA - This does all the work!
            auto_model = auto_arima(
                price_series,
                start_p=0, start_q=0,      # Starting values
                max_p=3, max_q=3,          # Maximum values to test
                seasonal=False,             # No seasonality for daily stock prices
                stepwise=True,             # Use stepwise algorithm (faster)
                suppress_warnings=True,
                error_action='ignore',
                trace=False                # Set to True to see search process
            )
            
            # Get the optimal order
            optimal_order = auto_model.order
            aic_score = auto_model.aic()
            
            print(f"Optimal PDQ: {optimal_order}, AIC: {aic_score:.2f}")
            
            # Make predictions
            forecast_steps = split['horizon']
            forecast, conf_int = auto_model.predict(n_periods=forecast_steps, 
                                                   return_conf_int=True)
            
            # Store results
            results.append({
                'ticker': split['ticker'],
                'split_id': split['split_id'],
                'optimal_pdq': optimal_order,
                'aic': aic_score,
                'forecast_mean': forecast.mean(),
                'forecast_std': forecast.std(),
                'forecast_values': forecast.tolist(),
                'confidence_lower': conf_int[:, 0].tolist(),
                'confidence_upper': conf_int[:, 1].tolist(),
                'train_start': split['train_start'],
                'train_end': split['train_end'],
                'test_start': split['test_start'],
                'test_end': split['test_end'],
                'train_size': len(price_series)
            })
            
        except Exception as e:
            print(f"Error processing {split['ticker']} split {split['split_id']}: {str(e)}")
            continue
    
    return pd.DataFrame(results)


In [None]:
# Test Auto ARIMA
print("Testing Auto ARIMA...")
print("="*50)

# Run the Auto ARIMA training
try:
    auto_arima_results = train_auto_arima_models(df=df, splits=df_split, parameters=parameters)
    
    # Display results
    if not auto_arima_results.empty:
        print("\n🎯 Auto ARIMA Results Summary:")
        print("-" * 40)
        display_cols = ['ticker', 'split_id', 'optimal_pdq', 'aic', 'forecast_mean']
        print(auto_arima_results[display_cols].head())
        
        # Show PDQ distribution
        print("\n📊 PDQ Parameter Distribution:")
        pdq_counts = auto_arima_results['optimal_pdq'].value_counts()
        for pdq, count in pdq_counts.head(5).items():
            print(f"  {pdq}: {count} times")
            
    else:
        print("No results generated. Install pmdarima first.")
        
except Exception as e:
    print(f"Error: {e}")
    print("\n💡 To use Auto ARIMA, install pmdarima:")
    print("    pip install pmdarima")


In [None]:
# Why Auto ARIMA is Better - Comparison

print("🔄 ARIMA Optimization Methods Comparison")
print("=" * 50)

print("""
📊 MANUAL GRID SEARCH (What we did before):
   ❌ Tests all P,D,Q combinations (3×3×3 = 27 models)
   ❌ Very slow for large datasets  
   ❌ May miss optimal combinations
   ❌ No intelligent search strategy
   ❌ Fixed search space

🚀 AUTO ARIMA (pmdarima package):
   ✅ Uses stepwise algorithm - much faster
   ✅ Automatically tests stationarity
   ✅ Intelligent search with information criteria
   ✅ Handles seasonality detection
   ✅ Can expand search space if needed
   ✅ Built-in model validation
   
💡 RECOMMENDED USAGE:
""")

code_example = '''
# Simple Auto ARIMA
from pmdarima import auto_arima

model = auto_arima(
    price_series,
    seasonal=False,     # For daily stock prices
    stepwise=True,      # Fast stepwise algorithm  
    suppress_warnings=True,
    error_action='ignore'
)

optimal_pdq = model.order  # Gets best (p,d,q)
'''

print(code_example)

print("""
🎯 KEY BENEFITS FOR STOCK FORECASTING:
   • Automatically determines if differencing is needed (D parameter)
   • Finds optimal P,Q without exhaustive search
   • Much faster than grid search
   • More robust model selection
   • Handles edge cases automatically
""")


In [27]:
df.shape

(2769, 31)

In [49]:
df_split.head()

Unnamed: 0,ticker,split_id,train_start,train_end,test_start,test_end,train_idx,test_idx,horizon,window_size
0,AAPL,0,2019-01-02,2019-08-06,2019-08-07,2019-09-18,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[150, 151, 152, 153, 154, 155, 156, 157, 158, ...",30,150
1,AAPL,1,2019-01-03,2019-08-07,2019-08-08,2019-09-19,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[151, 152, 153, 154, 155, 156, 157, 158, 159, ...",30,150
2,AAPL,2,2019-01-04,2019-08-08,2019-08-09,2019-09-20,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...","[152, 153, 154, 155, 156, 157, 158, 159, 160, ...",30,150
3,AAPL,3,2019-01-07,2019-08-09,2019-08-12,2019-09-23,"[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...","[153, 154, 155, 156, 157, 158, 159, 160, 161, ...",30,150
4,AAPL,4,2019-01-08,2019-08-12,2019-08-13,2019-09-24,"[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...","[154, 155, 156, 157, 158, 159, 160, 161, 162, ...",30,150
