# Data Preparation for Prophet Model

This notebook prepares the cleaned stock data for use with the Prophet model. Prophet requires specific formatting:
- A column named 'ds' containing dates
- A column named 'y' containing the target values (closing prices)

We'll process each of the 20 stock datasets and save them in the appropriate format.

In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
%matplotlib inline

## 1. Define Stock List and Data Paths

In [2]:
# Ticker symbols of the 20 stocks to convert; each one maps to one cleaned CSV
stocks = [
    "AAPL", "MSFT", "GOOG", "AMZN", "TSLA",
    "META", "NVDA", "SPY", "V", "DIS",
    "NFLX", "PYPL", "BABA", "IBM", "AMD",
    "BA", "INTC", "T", "GS", "NKE",
]

# Cleaned CSVs are read from input_folder; Prophet-formatted CSVs go to output_folder
input_folder, output_folder = "data/cleaned", "data/prophet"

# Ensure the destination exists before anything is written (no-op if already there)
os.makedirs(output_folder, exist_ok=True)

## 2. Prophet Data Preparation Function

Prophet requires a specific format with two main columns:
- `ds`: Date column
- `y`: Target value (typically closing price for stock prediction)

In [3]:
def prepare_prophet_data(df):
    """
    Prepare stock data for Prophet model.

    Renames 'date' -> 'ds' and 'close' -> 'y', keeps those two columns plus
    whichever of the optional regressor columns are present, and carries over
    the next-day evaluation columns when available. The input dataframe is
    never modified.

    Parameters:
    - df: Cleaned dataframe with stock data. Must contain 'date' and 'close'
      (or already-renamed 'ds' and 'y') columns.

    Returns:
    - DataFrame formatted for Prophet. Column order is 'ds', 'y', the
      available regressors (in the order listed in regressor_cols), then
      'next_day_y' if df has 'next_day_close', and 'price_up' if df has both
      'next_day_close' and 'price_up'.

    Raises:
    - KeyError: if the 'ds' / 'y' columns cannot be produced from df.
    """
    # Prophet requires columns named 'ds' (date) and 'y' (target)
    basic_cols = ['ds', 'y']

    # Additional regressor columns that might help the model
    regressor_cols = [
        'open',
        'high',
        'low',
        'volume',
        'ma5',
        'ma20',
        'ma50',
        'volatility',
        'volume_ma20',
        'return'
    ]

    # rename() returns a new frame, so the caller's dataframe is left untouched
    renamed = df.rename(columns={'date': 'ds', 'close': 'y'})

    # Fail with an explicit message instead of pandas' generic "not in index"
    missing = [col for col in basic_cols if col not in renamed.columns]
    if missing:
        raise KeyError(
            f"Cannot build Prophet column(s) {missing}: "
            "input data needs 'date' and 'close' columns"
        )

    selected_cols = basic_cols + [col for col in regressor_cols if col in renamed.columns]

    # Explicit copy: the column subset would otherwise be flagged as a slice of
    # `renamed`, and the assignments below would raise SettingWithCopyWarning
    prophet_df = renamed[selected_cols].copy()

    # Carry over the next-day target (and direction flag) for forecast evaluation
    if 'next_day_close' in df.columns:
        prophet_df['next_day_y'] = df['next_day_close']
        # 'price_up' is optional; its absence must not abort the whole stock
        if 'price_up' in df.columns:
            prophet_df['price_up'] = df['price_up']

    return prophet_df

## 3. Process Each Stock and Save Prophet Format Data

In [4]:
# One record per stock that was converted and saved successfully
results_summary = []

for stock in stocks:
    input_file = f"{input_folder}/{stock}_cleaned.csv"
    try:
        # Nothing to convert if the cleaning step never produced this file
        if not os.path.exists(input_file):
            print(f"Warning: {input_file} does not exist. Skipping {stock}.")
            continue

        # Load the cleaned data and make sure 'date' holds real datetimes
        df = pd.read_csv(input_file)
        df = df.assign(date=pd.to_datetime(df['date']))

        # Convert to Prophet layout and persist it
        prophet_df = prepare_prophet_data(df)
        output_file = f"{output_folder}/{stock}_prophet.csv"
        prophet_df.to_csv(output_file, index=False)

        # Record headline statistics for the overview table
        dates, prices = prophet_df['ds'], prophet_df['y']
        n_rows = len(prophet_df)
        results_summary.append({
            'stock': stock,
            'rows': n_rows,
            'start_date': dates.min().strftime('%Y-%m-%d'),
            'end_date': dates.max().strftime('%Y-%m-%d'),
            'min_price': round(prices.min(), 2),
            'max_price': round(prices.max(), 2),
            'avg_price': round(prices.mean(), 2),
            'file_size_kb': round(os.path.getsize(output_file) / 1024, 2)
        })
        print(f"Processed {stock}: {n_rows} rows saved to {output_file}")

    except Exception as e:
        # Best effort: report the failure and move on to the next stock
        print(f"Error processing {stock}: {str(e)}")

# Tabulate the per-stock records for display in the next section
summary_df = pd.DataFrame(results_summary)
print("\nData preparation for Prophet completed!")

## 4. Display Summary of Prepared Datasets

In [5]:
# Display the summary table built by the processing loop: one row per stock
# that was saved successfully (skipped or failed stocks do not appear).
# Left as the cell's last expression so the notebook renders it as a rich table.
summary_df

## 5. Examine a Sample Dataset

In [6]:
# Reload the saved AAPL file from disk to check what was actually written
sample_file = f"{output_folder}/AAPL_prophet.csv"
if not os.path.exists(sample_file):
    print("Sample file not found. Please run the data preparation first.")
else:
    sample_df = pd.read_csv(sample_file)
    print("Sample from AAPL_prophet.csv (First 5 rows):")
    display(sample_df.head())

    # Plot closing price over time on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=(12, 6))
    # 'ds' is read back without date parsing; convert it so the x-axis is a time axis
    sample_df['ds'] = pd.to_datetime(sample_df['ds'])
    ax.plot(sample_df['ds'], sample_df['y'])
    ax.set_title('AAPL Stock Price')
    ax.set_xlabel('Date')
    ax.set_ylabel('Closing Price ($)')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

## 6. Data Ready for Prophet

Our data is now ready for training Prophet models. The prepared datasets include:

1. Required Prophet columns (`ds` for dates and `y` for target values)
2. Additional regressors that can be used as external features
3. Next-day price information (`next_day_y` and `price_up`), included when the cleaned data provides it, for evaluating forecasts

The data is saved in the `data/prophet/` directory (relative to the notebook's working directory), one file per stock, named `<SYMBOL>_prophet.csv` (for example, `AAPL_prophet.csv`).