# Stock Price Prediction Data Preparation for XGBoost

This notebook prepares stock data for prediction using XGBoost models for three different time periods:
1. Next day close price
2. Next week average close price
3. Next month average close price

For each stock, we'll create three separate datasets specifically formatted for each prediction period.

In [4]:
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
import xgboost as xgb
from sklearn.model_selection import train_test_split # Though we'll use manual split for time series
from sklearn.metrics import mean_squared_error
warnings.filterwarnings('ignore')

In [5]:



# List of stock symbols
stocks = ["AAPL", "MSFT", "GOOG", "AMZN", "TSLA", 
          "META", "NVDA", "SPY", "V", "DIS",
          "NFLX", "PYPL", "BABA", "IBM", "AMD",
          "BA", "INTC", "T", "GS", "NKE"]

# Path to the project root
project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))

# Paths for input and output data
input_folder = os.path.join(project_root, "data", "xgboost","initial")
output_base_folder = os.path.join(project_root, "data", "xgboost")

# Create output folders for each prediction period
output_folders = {
    'day': os.path.join(output_base_folder, "day"),
    'week': os.path.join(output_base_folder, "week"),
    'month': os.path.join(output_base_folder, "month")
}

# Ensure output directories exist
for folder in output_folders.values():
    os.makedirs(folder, exist_ok=True)

In [None]:
def prepare_daily_prediction_data(df, stock_symbol):
    """
    Prepare data for next day close price prediction.
    The next_day_close and next_day_close_original columns are already present.
    """
    # Make a copy to avoid modifications to the original dataframe
    daily_df = df.copy()
    
    # Rename target columns for clarity
    daily_df = daily_df.rename(columns={
        'next_day_close': 'target',
        'next_day_close_original': 'target_original',
        'pct_change_next_day': 'pct_change_target'
    })
    
    # Drop rows with NaN target values (typically the last row)
    daily_df = daily_df.dropna(subset=['target'])
    
    # Save to CSV
    output_path = os.path.join(output_folders['day'], f"{stock_symbol}_xgboost_day.csv")
    daily_df.to_csv(output_path, index=False)
    
    return output_path

In [None]:
def prepare_weekly_prediction_data(df, stock_symbol):
    """
    Prepare data for next week average close price prediction.
    We'll calculate the average close price for the next 5 trading days.
    """
    # Make a copy to avoid modifications to the original dataframe
    weekly_df = df.copy()
    
    # Calculate the average close price for the next 5 trading days
    # First, get the original close prices (non-normalized)
    close_original = weekly_df['close_original'].values
    
    # Initialize arrays for targets
    next_week_close = np.full(len(weekly_df), np.nan)
    next_week_close_original = np.full(len(weekly_df), np.nan)
    pct_change_next_week = np.full(len(weekly_df), np.nan)
    
    # For each day, calculate the average of the next 5 trading days
    for i in range(len(weekly_df) - 5):
        next_week_close[i] = np.mean(weekly_df['close'].values[i+1:i+6])
        next_week_close_original[i] = np.mean(close_original[i+1:i+6])
        # Calculate percentage change from current close to next week average
        pct_change_next_week[i] = (next_week_close_original[i] / close_original[i]) - 1
    
    # Add target columns
    weekly_df['target'] = next_week_close
    weekly_df['target_original'] = next_week_close_original
    weekly_df['pct_change_target'] = pct_change_next_week
    
    # Create binary target for price movement direction
    weekly_df['price_up'] = (weekly_df['pct_change_target'] > 0).astype(int)
    
    # Drop rows with NaN target values
    weekly_df = weekly_df.dropna(subset=['target'])
    
    # Save to CSV
    output_path = os.path.join(output_folders['week'], f"{stock_symbol}_xgboost_week.csv")
    weekly_df.to_csv(output_path, index=False)
    
    return output_path

In [None]:
def prepare_monthly_prediction_data(df, stock_symbol):
    """
    Prepare data for next month average close price prediction.
    We'll calculate the average close price for the next 21 trading days (approx. one month).
    """
    # Make a copy to avoid modifications to the original dataframe
    monthly_df = df.copy()
    
    # Calculate the average close price for the next 21 trading days
    # First, get the original close prices (non-normalized)
    close_original = monthly_df['close_original'].values
    
    # Initialize arrays for targets
    next_month_close = np.full(len(monthly_df), np.nan)
    next_month_close_original = np.full(len(monthly_df), np.nan)
    pct_change_next_month = np.full(len(monthly_df), np.nan)
    
    # For each day, calculate the average of the next 21 trading days
    for i in range(len(monthly_df) - 21):
        next_month_close[i] = np.mean(monthly_df['close'].values[i+1:i+22])
        next_month_close_original[i] = np.mean(close_original[i+1:i+22])
        # Calculate percentage change from current close to next month average
        pct_change_next_month[i] = (next_month_close_original[i] / close_original[i]) - 1
    
    # Add target columns
    monthly_df['target'] = next_month_close
    monthly_df['target_original'] = next_month_close_original
    monthly_df['pct_change_target'] = pct_change_next_month
    
    # Create binary target for price movement direction
    monthly_df['price_up'] = (monthly_df['pct_change_target'] > 0).astype(int)
    
    # Drop rows with NaN target values
    monthly_df = monthly_df.dropna(subset=['target'])
    
    # Save to CSV
    output_path = os.path.join(output_folders['month'], f"{stock_symbol}_xgboost_month.csv")
    monthly_df.to_csv(output_path, index=False)
    
    return output_path

In [None]:
def process_stock(stock_symbol):
    """
    Process a single stock symbol to create all three prediction datasets.
    """
    print(f"Processing {stock_symbol}...")
    
    # Construct the input file path
    input_file = os.path.join(input_folder, f"{stock_symbol}_xgboost.csv")
    
    # Check if the file exists
    if not os.path.exists(input_file):
        print(f"Warning: {input_file} not found. Skipping {stock_symbol}.")
        return None
    
    # Read the CSV file
    df = pd.read_csv(input_file, parse_dates=['date'])
    
    # Process data for each prediction period
    daily_output = prepare_daily_prediction_data(df, stock_symbol)
    weekly_output = prepare_weekly_prediction_data(df, stock_symbol)
    monthly_output = prepare_monthly_prediction_data(df, stock_symbol)
    
    print(f"Created prediction datasets for {stock_symbol}:")
    print(f"  - Daily: {daily_output}")
    print(f"  - Weekly: {weekly_output}")
    print(f"  - Monthly: {monthly_output}")
    
    return {
        'day': daily_output,
        'week': weekly_output,
        'month': monthly_output
    }

In [None]:
# Process all stocks
results = {}
for stock in stocks:
    results[stock] = process_stock(stock)

In [None]:
# Print summary
print("\nProcessing Summary:")
print(f"Total stocks processed: {len([r for r in results.values() if r is not None])}")
print(f"Failed to process: {len([r for r in results.values() if r is None])}")

In [None]:
def validate_datasets():
    """
    Validate the created datasets to ensure they have the expected structure.
    """
    print("\nValidating datasets...")
    
    for period in ['day', 'week', 'month']:
        folder = output_folders[period]
        files = os.listdir(folder)
        if not files:
            print(f"Warning: No files found in {folder}")
            continue
            
        # Check the first file as a sample
        sample_file = os.path.join(folder, files[0])
        sample_df = pd.read_csv(sample_file)
        
        # Check for required columns
        required_cols = ['date', 'target', 'target_original', 'pct_change_target', 'price_up']
        missing = [col for col in required_cols if col not in sample_df.columns]
        
        if missing:
            print(f"Warning: {period} datasets are missing columns: {missing}")
        else:
            print(f"{period} datasets validation passed!")
            print(f"  - Sample file: {files[0]}")
            print(f"  - Row count: {len(sample_df)}")
            print(f"  - Target mean: {sample_df['target'].mean():.4f}")
    
    print("\nValidation complete!")

In [None]:
# Run validation
validate_datasets()

## Example Analysis

Let's examine a sample of the generated datasets to confirm they're structured correctly.

In [None]:
# Try to load and display samples from each time period if available
try:
    # Get first available stock
    available_stocks = [stock for stock in stocks if results.get(stock) is not None]
    if available_stocks:
        sample_stock = available_stocks[0]
        
        print(f"Sample analysis for {sample_stock}:")
        
        # Load samples from each period
        for period in ['day', 'week', 'month']:
            file_path = os.path.join(output_folders[period], f"{sample_stock}_xgboost_{period}.csv")
            if os.path.exists(file_path):
                df = pd.read_csv(file_path, parse_dates=['date'])
                print(f"\n{period.capitalize()} prediction dataset:")
                print(f"  - Shape: {df.shape}")
                print(f"  - Date range: {df['date'].min()} to {df['date'].max()}")
                print(f"  - Percent of price increases: {df['price_up'].mean()*100:.2f}%")
                print("\nSample rows:")
                print(df[['date', 'close_original', 'target_original', 'pct_change_target', 'price_up']].head())
    else:
        print("No processed stocks available for sample analysis.")
except Exception as e:
    print(f"Error during sample analysis: {e}")

# XGBoost Model Training for Stock Price Prediction

This notebook trains XGBoost models for stock price prediction for three different time periods:
1. Next day close price
2. Next week average close price
3. Next month average close price

Models will be trained for each stock and each period, resulting in 60 models. The trained models will be saved in the `models/xgboost/{period}/` directory.

In [6]:

periods = ['day', 'week', 'month']

# Paths for input data and output models
input_base_folder = os.path.join(project_root, "data", "xgboost")
output_base_folder_models = os.path.join(project_root, "models", "xgboost")

# Ensure output model directories exist
for period in periods:
    os.makedirs(os.path.join(output_base_folder_models, period), exist_ok=True)

print(f"Project Root: {project_root}")
print(f"Input Data Base Folder: {input_base_folder}")
print(f"Output Models Base Folder: {output_base_folder_models}")
print("Setup complete. Model output directories created/verified.")

Project Root: /home/abderrahmane/Stock-Market-Predictor
Input Data Base Folder: /home/abderrahmane/Stock-Market-Predictor/data/xgboost
Output Models Base Folder: /home/abderrahmane/Stock-Market-Predictor/models/xgboost
Setup complete. Model output directories created/verified.


In [8]:
def split_time_series_data(df, target_column='target', test_size=0.2):
    """
    Splits time-series data into training and testing sets.
    Data is assumed to be sorted chronologically by 'date'.
    """
    if 'date' in df.columns:
        df = df.sort_values(by='date').reset_index(drop=True)
    
    # Define features: all columns except target and other specified ones
    # 'price_up' might be missing in daily data; it will be excluded if not present.
    excluded_cols = {target_column, 'target_original', 'pct_change_target', 'price_up', 'date'}
    feature_cols = [col for col in df.columns if col not in excluded_cols and col in df.columns] # check col in df.columns
    
    X = df[feature_cols]
    y = df[target_column]
    
    # Calculate split index for chronological split
    split_idx = int(len(df) * (1 - test_size))
    
    X_train = X.iloc[:split_idx]
    y_train = y.iloc[:split_idx]
    X_test = X.iloc[split_idx:]
    y_test = y.iloc[split_idx:]
    
    return X_train, y_train, X_test, y_test, feature_cols

In [9]:
def train_and_save_xgboost_model(X_train, y_train, X_test, y_test, model_path, xgb_params=None):
    """
    Trains an XGBoost regressor model, evaluates it, and saves it.
    """
    if xgb_params is None:
        # Default parameters - consider tuning these
        xgb_params = {
            'objective': 'reg:squarederror', # For regression
            'n_estimators': 100,             # Number of trees
            'learning_rate': 0.1,            # Step size shrinkage
            'max_depth': 3,                  # Maximum depth of a tree
            'subsample': 0.8,                # Subsample ratio of the training instance
            'colsample_bytree': 0.8,         # Subsample ratio of columns when constructing each tree
            'random_state': 42,
            'n_jobs': -1                     # Use all available cores
        }

    model = xgb.XGBRegressor(**xgb_params)
    
    # Train the model with early stopping
    # Note: Using X_test, y_test for eval_set is common for simplicity here,
    # but a dedicated validation set is best practice for early stopping and tuning.
    model.fit(X_train, y_train,
              eval_set=[(X_test, y_test)],
              early_stopping_rounds=10, # Stop if no improvement on eval set after 10 rounds
              verbose=False)            # Set to True or a number to see training progress

    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"  Test RMSE: {rmse:.4f}")
    
    # Save the model
    model.save_model(model_path)
    print(f"  Model saved to {model_path}")
    
    return model, rmse

In [10]:
# Main loop to process each stock and period

model_performance_summary = []

for stock in stocks:
    for period in periods:
        print(f"\nProcessing: Stock = {stock}, Period = {period}")
        
        # 1. Load data
        csv_file_name = f"{stock}_xgboost_{period}.csv"
        csv_file_path = os.path.join(input_base_folder, period, csv_file_name)
        
        if not os.path.exists(csv_file_path):
            print(f"  Data file not found: {csv_file_path}. Skipping.")
            model_performance_summary.append({
                'stock': stock, 'period': period, 'status': 'data_not_found', 'rmse': np.nan, 
                'model_path': '', 'num_features': 0, 'train_samples': 0, 'test_samples': 0
            })
            continue
            
        try:
            df = pd.read_csv(csv_file_path, parse_dates=['date'])
        except Exception as e:
            print(f"  Error loading data for {stock} {period}: {e}. Skipping.")
            model_performance_summary.append({
                'stock': stock, 'period': period, 'status': f'load_error: {e}', 'rmse': np.nan,
                'model_path': '', 'num_features': 0, 'train_samples': 0, 'test_samples': 0
            })
            continue

        if df.empty:
            print(f"  Data file is empty: {csv_file_path}. Skipping.")
            model_performance_summary.append({
                'stock': stock, 'period': period, 'status': 'data_empty', 'rmse': np.nan,
                'model_path': '', 'num_features': 0, 'train_samples': 0, 'test_samples': 0
            })
            continue
            
        if 'target' not in df.columns:
            print(f"  'target' column not found in {csv_file_path}. Skipping.")
            model_performance_summary.append({
                'stock': stock, 'period': period, 'status': 'target_missing', 'rmse': np.nan,
                'model_path': '', 'num_features': 0, 'train_samples': 0, 'test_samples': 0
            })
            continue
        
        # Minimum number of rows to attempt training
        if len(df) < 20: 
            print(f"  Not enough data for {stock} {period} (rows: {len(df)}). Skipping.")
            model_performance_summary.append({
                'stock': stock, 'period': period, 'status': 'insufficient_data', 'rmse': np.nan,
                'model_path': '', 'num_features': 0, 'train_samples': 0, 'test_samples': len(df)
            })
            continue

        # 2. Split data
        X_train, y_train, X_test, y_test, feature_names = split_time_series_data(
            df, target_column='target', test_size=0.2
        )
        
        if X_train.empty or X_test.empty or y_train.empty or y_test.empty:
            print(f"  Train or test set is empty after split for {stock} {period}. Skipping.")
            status = 'split_error_empty_set'
            if X_train.empty: status += '_train_empty'
            if X_test.empty: status += '_test_empty'
            model_performance_summary.append({
                'stock': stock, 'period': period, 'status': status, 'rmse': np.nan,
                'model_path': '', 'num_features': len(feature_names), 
                'train_samples': len(X_train), 'test_samples': len(X_test)
            })
            continue
        
        if not feature_names:
            print(f"  No features found for {stock} {period} after excluding columns. Skipping.")
            model_performance_summary.append({
                'stock': stock, 'period': period, 'status': 'no_features', 'rmse': np.nan,
                'model_path': '', 'num_features': 0, 
                'train_samples': len(X_train), 'test_samples': len(X_test)
            })
            continue

        print(f"  Features ({len(feature_names)}): {', '.join(feature_names[:5])}{'...' if len(feature_names) > 5 else ''}")
        print(f"  Train shape: X={X_train.shape}, y={y_train.shape}. Test shape: X={X_test.shape}, y={y_test.shape}")

        # 3. Train and save model
        model_file_name = f"xgboost_{stock}_{period}.model"
        model_output_path = os.path.join(output_base_folder_models, period, model_file_name)
        
        # Optimized XGBoost parameters (a starting point, further tuning recommended)
        xgb_params = {
            'objective': 'reg:squarederror',
            'n_estimators': 500,        # Increased, but early stopping will manage it
            'learning_rate': 0.05,       # Smaller learning rate
            'max_depth': 5,              # Moderately deep trees
            'subsample': 0.7,            # Fraction of observations to be randomly samples for each tree.
            'colsample_bytree': 0.7,     # Fraction of columns to be randomly samples for each tree.
            'random_state': 42,
            'n_jobs': -1,                # Use all CPU cores
            # 'tree_method': 'hist',     # Often faster for larger datasets, requires testing
            'alpha': 0.1,                # L1 regularization
            'lambda': 0.1                # L2 regularization
        }

        try:
            model, rmse = train_and_save_xgboost_model(
                X_train, y_train, X_test, y_test, 
                model_output_path, 
                xgb_params=xgb_params
            )
            model_performance_summary.append({
                'stock': stock, 
                'period': period, 
                'status': 'success',
                'rmse': rmse, 
                'model_path': model_output_path,
                'num_features': len(feature_names),
                'train_samples': len(X_train),
                'test_samples': len(X_test)
            })
        except Exception as e:
            print(f"  Error training model for {stock} {period}: {e}. Skipping.")
            model_performance_summary.append({
                'stock': stock, 'period': period, 'status': f'training_error: {e}', 'rmse': np.nan,
                'model_path': model_output_path, # Path where it would have been saved
                'num_features': len(feature_names), 
                'train_samples': len(X_train), 'test_samples': len(X_test)
            })
            continue
            
print("\n--- All processing complete ---")


Processing: Stock = AAPL, Period = day
  Features (33): open, high, low, close, volume...
  Train shape: X=(2010, 33), y=(2010,). Test shape: X=(503, 33), y=(503,)
  Error training model for AAPL day: XGBModel.fit() got an unexpected keyword argument 'early_stopping_rounds'. Skipping.

Processing: Stock = AAPL, Period = week
  Features (36): open, high, low, close, volume...
  Train shape: X=(2006, 36), y=(2006,). Test shape: X=(502, 36), y=(502,)
  Error training model for AAPL week: XGBModel.fit() got an unexpected keyword argument 'early_stopping_rounds'. Skipping.

Processing: Stock = AAPL, Period = month
  Features (36): open, high, low, close, volume...
  Train shape: X=(1993, 36), y=(1993,). Test shape: X=(499, 36), y=(499,)
  Error training model for AAPL month: XGBModel.fit() got an unexpected keyword argument 'early_stopping_rounds'. Skipping.

Processing: Stock = MSFT, Period = day
  Features (33): open, high, low, close, volume...
  Train shape: X=(2010, 33), y=(2010,). Te

In [None]:
# Display summary of trained models
if model_performance_summary:
    performance_df = pd.DataFrame(model_performance_summary)
    print("\n--- Model Training Summary ---")
    
    # Display a concise summary
    print(f"Total models attempted: {len(performance_df)}")
    print(f"Successfully trained models: {len(performance_df[performance_df['status'] == 'success'])}")
    
    print("\nDetails per model:")
    print(performance_df[['stock', 'period', 'status', 'rmse', 'num_features', 'train_samples', 'test_samples']].to_string())
    
    # Save summary to CSV
    summary_csv_path = os.path.join(project_root, "models", "xgboost_training_summary.csv")
    try:
        performance_df.to_csv(summary_csv_path, index=False)
        print(f"\nTraining summary saved to: {summary_csv_path}")
    except Exception as e:
        print(f"\nError saving training summary: {e}")
else:
    print("\nNo models were attempted or no summary data collected.")

print("\n--- Recommendations ---")
print("1. Review the 'status' column in the summary for any issues during processing.")
print("2. The XGBoost parameters provided are a general starting point. For optimal performance,")
print("   consider implementing systematic hyperparameter tuning (e.g., using GridSearchCV,")
print("   RandomizedSearchCV, or Bayesian optimization with tools like Optuna) for each model or period.")
print("3. Ensure your input data CSVs are correctly formatted and contain sufficient historical data.")
print("4. A dedicated validation set (split from the training data) is recommended for more robust")
print("   early stopping and hyperparameter tuning, rather than using the test set for early stopping.")