# Stock Price Prediction Data Preparation for XGBoost

This notebook prepares stock data for prediction using XGBoost models for three different time periods:
1. Next day close price
2. Next week average close price
3. Next month average close price

For each stock, we'll create three separate datasets specifically formatted for each prediction period.

In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

In [2]:
# List of stock symbols
stocks = ["AAPL", "MSFT", "GOOG", "AMZN", "TSLA", 
          "META", "NVDA", "SPY", "V", "DIS",
          "NFLX", "PYPL", "BABA", "IBM", "AMD",
          "BA", "INTC", "T", "GS", "NKE"]

# Path to the project root
project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))

# Paths for input and output data
input_folder = os.path.join(project_root, "data", "xgboost","initial")
output_base_folder = os.path.join(project_root, "data", "xgboost")

# Create output folders for each prediction period
output_folders = {
    'day': os.path.join(output_base_folder, "day"),
    'week': os.path.join(output_base_folder, "week"),
    'month': os.path.join(output_base_folder, "month")
}

# Ensure output directories exist
for folder in output_folders.values():
    os.makedirs(folder, exist_ok=True)

In [3]:
def prepare_daily_prediction_data(df, stock_symbol):
    """
    Prepare data for next day close price prediction.
    The next_day_close and next_day_close_original columns are already present.
    """
    # Make a copy to avoid modifications to the original dataframe
    daily_df = df.copy()
    
    # Rename target columns for clarity
    daily_df = daily_df.rename(columns={
        'next_day_close': 'target',
        'next_day_close_original': 'target_original',
        'pct_change_next_day': 'pct_change_target'
    })
    
    # Drop rows with NaN target values (typically the last row)
    daily_df = daily_df.dropna(subset=['target'])
    
    # Save to CSV
    output_path = os.path.join(output_folders['day'], f"{stock_symbol}_xgboost_day.csv")
    daily_df.to_csv(output_path, index=False)
    
    return output_path

In [4]:
def prepare_weekly_prediction_data(df, stock_symbol):
    """
    Prepare data for next week average close price prediction.
    We'll calculate the average close price for the next 5 trading days.
    """
    # Make a copy to avoid modifications to the original dataframe
    weekly_df = df.copy()
    
    # Calculate the average close price for the next 5 trading days
    # First, get the original close prices (non-normalized)
    close_original = weekly_df['close_original'].values
    
    # Initialize arrays for targets
    next_week_close = np.full(len(weekly_df), np.nan)
    next_week_close_original = np.full(len(weekly_df), np.nan)
    pct_change_next_week = np.full(len(weekly_df), np.nan)
    
    # For each day, calculate the average of the next 5 trading days
    for i in range(len(weekly_df) - 5):
        next_week_close[i] = np.mean(weekly_df['close'].values[i+1:i+6])
        next_week_close_original[i] = np.mean(close_original[i+1:i+6])
        # Calculate percentage change from current close to next week average
        pct_change_next_week[i] = (next_week_close_original[i] / close_original[i]) - 1
    
    # Add target columns
    weekly_df['target'] = next_week_close
    weekly_df['target_original'] = next_week_close_original
    weekly_df['pct_change_target'] = pct_change_next_week
    
    # Create binary target for price movement direction
    weekly_df['price_up'] = (weekly_df['pct_change_target'] > 0).astype(int)
    
    # Drop rows with NaN target values
    weekly_df = weekly_df.dropna(subset=['target'])
    
    # Save to CSV
    output_path = os.path.join(output_folders['week'], f"{stock_symbol}_xgboost_week.csv")
    weekly_df.to_csv(output_path, index=False)
    
    return output_path

In [5]:
def prepare_monthly_prediction_data(df, stock_symbol):
    """
    Prepare data for next month average close price prediction.
    We'll calculate the average close price for the next 21 trading days (approx. one month).
    """
    # Make a copy to avoid modifications to the original dataframe
    monthly_df = df.copy()
    
    # Calculate the average close price for the next 21 trading days
    # First, get the original close prices (non-normalized)
    close_original = monthly_df['close_original'].values
    
    # Initialize arrays for targets
    next_month_close = np.full(len(monthly_df), np.nan)
    next_month_close_original = np.full(len(monthly_df), np.nan)
    pct_change_next_month = np.full(len(monthly_df), np.nan)
    
    # For each day, calculate the average of the next 21 trading days
    for i in range(len(monthly_df) - 21):
        next_month_close[i] = np.mean(monthly_df['close'].values[i+1:i+22])
        next_month_close_original[i] = np.mean(close_original[i+1:i+22])
        # Calculate percentage change from current close to next month average
        pct_change_next_month[i] = (next_month_close_original[i] / close_original[i]) - 1
    
    # Add target columns
    monthly_df['target'] = next_month_close
    monthly_df['target_original'] = next_month_close_original
    monthly_df['pct_change_target'] = pct_change_next_month
    
    # Create binary target for price movement direction
    monthly_df['price_up'] = (monthly_df['pct_change_target'] > 0).astype(int)
    
    # Drop rows with NaN target values
    monthly_df = monthly_df.dropna(subset=['target'])
    
    # Save to CSV
    output_path = os.path.join(output_folders['month'], f"{stock_symbol}_xgboost_month.csv")
    monthly_df.to_csv(output_path, index=False)
    
    return output_path

In [8]:
def process_stock(stock_symbol):
    """
    Process a single stock symbol to create all three prediction datasets.
    """
    print(f"Processing {stock_symbol}...")
    
    # Construct the input file path
    input_file = os.path.join(input_folder, f"{stock_symbol}_xgboost.csv")
    
    # Check if the file exists
    if not os.path.exists(input_file):
        print(f"Warning: {input_file} not found. Skipping {stock_symbol}.")
        return None
    
    # Read the CSV file
    df = pd.read_csv(input_file, parse_dates=['date'])
    
    # Process data for each prediction period
    daily_output = prepare_daily_prediction_data(df, stock_symbol)
    weekly_output = prepare_weekly_prediction_data(df, stock_symbol)
    monthly_output = prepare_monthly_prediction_data(df, stock_symbol)
    
    print(f"Created prediction datasets for {stock_symbol}:")
    print(f"  - Daily: {daily_output}")
    print(f"  - Weekly: {weekly_output}")
    print(f"  - Monthly: {monthly_output}")
    
    return {
        'day': daily_output,
        'week': weekly_output,
        'month': monthly_output
    }

In [9]:
# Process all stocks
results = {}
for stock in stocks:
    results[stock] = process_stock(stock)

Processing AAPL...
Created prediction datasets for AAPL:
  - Daily: /home/abderrahmane/Stock-Market-Predictor/data/xgboost/day/AAPL_xgboost_day.csv
  - Weekly: /home/abderrahmane/Stock-Market-Predictor/data/xgboost/week/AAPL_xgboost_week.csv
  - Monthly: /home/abderrahmane/Stock-Market-Predictor/data/xgboost/month/AAPL_xgboost_month.csv
Processing MSFT...
Created prediction datasets for MSFT:
  - Daily: /home/abderrahmane/Stock-Market-Predictor/data/xgboost/day/MSFT_xgboost_day.csv
  - Weekly: /home/abderrahmane/Stock-Market-Predictor/data/xgboost/week/MSFT_xgboost_week.csv
  - Monthly: /home/abderrahmane/Stock-Market-Predictor/data/xgboost/month/MSFT_xgboost_month.csv
Processing GOOG...
Created prediction datasets for GOOG:
  - Daily: /home/abderrahmane/Stock-Market-Predictor/data/xgboost/day/GOOG_xgboost_day.csv
  - Weekly: /home/abderrahmane/Stock-Market-Predictor/data/xgboost/week/GOOG_xgboost_week.csv
  - Monthly: /home/abderrahmane/Stock-Market-Predictor/data/xgboost/month/GOOG_x

In [10]:
# Print summary
print("\nProcessing Summary:")
print(f"Total stocks processed: {len([r for r in results.values() if r is not None])}")
print(f"Failed to process: {len([r for r in results.values() if r is None])}")


Processing Summary:
Total stocks processed: 20
Failed to process: 0


In [11]:
def validate_datasets():
    """
    Validate the created datasets to ensure they have the expected structure.
    """
    print("\nValidating datasets...")
    
    for period in ['day', 'week', 'month']:
        folder = output_folders[period]
        files = os.listdir(folder)
        if not files:
            print(f"Warning: No files found in {folder}")
            continue
            
        # Check the first file as a sample
        sample_file = os.path.join(folder, files[0])
        sample_df = pd.read_csv(sample_file)
        
        # Check for required columns
        required_cols = ['date', 'target', 'target_original', 'pct_change_target', 'price_up']
        missing = [col for col in required_cols if col not in sample_df.columns]
        
        if missing:
            print(f"Warning: {period} datasets are missing columns: {missing}")
        else:
            print(f"{period} datasets validation passed!")
            print(f"  - Sample file: {files[0]}")
            print(f"  - Row count: {len(sample_df)}")
            print(f"  - Target mean: {sample_df['target'].mean():.4f}")
    
    print("\nValidation complete!")

In [12]:
# Run validation
validate_datasets()


Validating datasets...
day datasets validation passed!
  - Sample file: V_xgboost_day.csv
  - Row count: 2513
  - Target mean: 173.3284
week datasets validation passed!
  - Sample file: NKE_xgboost_week.csv
  - Row count: 2508
  - Target mean: 0.0023
month datasets validation passed!
  - Sample file: AMD_xgboost_month.csv
  - Row count: 2491
  - Target mean: 0.0024

Validation complete!


## Example Analysis

Let's examine a sample of the generated datasets to confirm they're structured correctly.

In [13]:
# Try to load and display samples from each time period if available
try:
    # Get first available stock
    available_stocks = [stock for stock in stocks if results.get(stock) is not None]
    if available_stocks:
        sample_stock = available_stocks[0]
        
        print(f"Sample analysis for {sample_stock}:")
        
        # Load samples from each period
        for period in ['day', 'week', 'month']:
            file_path = os.path.join(output_folders[period], f"{sample_stock}_xgboost_{period}.csv")
            if os.path.exists(file_path):
                df = pd.read_csv(file_path, parse_dates=['date'])
                print(f"\n{period.capitalize()} prediction dataset:")
                print(f"  - Shape: {df.shape}")
                print(f"  - Date range: {df['date'].min()} to {df['date'].max()}")
                print(f"  - Percent of price increases: {df['price_up'].mean()*100:.2f}%")
                print("\nSample rows:")
                print(df[['date', 'close_original', 'target_original', 'pct_change_target', 'price_up']].head())
    else:
        print("No processed stocks available for sample analysis.")
except Exception as e:
    print(f"Error during sample analysis: {e}")

Sample analysis for AAPL:

Day prediction dataset:
  - Shape: (2513, 38)
  - Date range: 2015-05-13 04:00:00+00:00 to 2025-05-08 04:00:00+00:00
  - Percent of price increases: 52.96%

Sample rows:
                       date  close_original  target_original  \
0 2015-05-13 04:00:00+00:00       28.259068        28.918398   
1 2015-05-14 04:00:00+00:00       28.918398        28.878025   
2 2015-05-15 04:00:00+00:00       28.878025        29.196480   
3 2015-05-18 04:00:00+00:00       29.196480        29.169573   
4 2015-05-19 04:00:00+00:00       29.169573        29.167328   

   pct_change_target  price_up  
0          -0.001396         1  
1           0.011028         0  
2          -0.000922         1  
3          -0.000077         0  
4           0.010226         0  

Week prediction dataset:
  - Shape: (2508, 41)
  - Date range: 2015-05-13 04:00:00+00:00 to 2025-05-01 04:00:00+00:00
  - Percent of price increases: 57.58%

Sample rows:
                       date  close_original  tar