In [1]:
!pip install -q tensorflow

In [2]:
# Import standard library modules
import sys

# Add the project root to the system path for importing in-house modules
sys.path.append("../../")

# Import in-house modules from the 'utilities' package
from utilities import split_dataset_by_date, clean_historical_data, check_tickers_for_missing_values
from utilities import load_data

In [3]:
# Data manipulation and analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Date and time manipulation
from datetime import date

# File and directory manipulation
from pathlib import Path

# Data preprocessing and model selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Import necessary Keras modules
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import set_random_seed

In [None]:
# Load the raw dataset through the in-house `load_data` helper.
# NOTE(review): judging by the file name this is presumably S&P 500
# adjusted-close data with NAs still present — confirm against `utilities`.
raw_data = load_data("../../data/raw_data/sp500_adj_close_raw_with_nas")

### Data Pre-Processing
___

#### Split today's data (for prediction) and historical data (for training)

In [None]:
# Cut-off date handed to the splitter; it is hard-coded, so update it when
# re-running the notebook on newer data.
todays_date = "2024-10-25"

# The in-house helper returns two objects: the historical portion and the
# portion for `todays_date`.
# NOTE(review): whether the cut-off is inclusive/exclusive is decided inside
# `utilities.split_dataset_by_date` — confirm there.
historical_data, todays_data = split_dataset_by_date(raw_data, todays_date)

print("Todays Date:", todays_date)

#### Confirm the missing values in today's data, since they are what we are predicting...

In [None]:
# Per-column count of missing values in today's slice
todays_data.isna().sum()

#### Handle missing values (NAs) in the historical data used to train and test...

In [None]:
# Swap the historical frame for its cleaned counterpart (in-house helper)
historical_data = clean_historical_data(historical_data)

# Per-column count of NAs that remain after cleaning
historical_data.isna().sum()

In [None]:
# The in-house checker returns a pair: tickers without NAs, tickers with NAs
(
    tickers_no_missing_values,
    tickers_with_missing_values,
) = check_tickers_for_missing_values(historical_data)

### Exploratory Data Analysis (EDA):
___

#### Today's Data:

In [None]:
# Report the (rows, columns) shape of the slice reserved for prediction
print("Today's Data Shape:", todays_data.shape)

#### Historical Data:

In [None]:
# Report the (rows, columns) shape of the cleaned historical frame
print("Historical Data Shape:", historical_data.shape)

### Create Multiple Versions of Dataset
___

#### Select which version of the data to work with

In [11]:
# Data with dates and without tickers (Set as index for reference)
def prepare_data_v2(main_data: pd.DataFrame) -> pd.DataFrame:
    """Build the date-aware version of the dataset.

    Operates on a copy of ``main_data`` (the input is left untouched) whose
    index has been reset to a fresh positional one. ``Date`` is parsed to
    datetime, ``Year``/``Month``/``Day`` columns are derived from it, and
    finally ``Date`` and ``Ticker`` are moved into a two-level row index.

    Args:
        main_data: Frame containing at least ``Date`` and ``Ticker`` columns.

    Returns:
        A new frame indexed by ``(Date, Ticker)`` with the three calendar
        columns appended after the original ones.
    """
    prepared = (
        main_data.copy()
        .reset_index(drop=True)
        # Parse first: the calendar columns below read the datetime version
        .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
        .assign(
            Year=lambda frame: frame["Date"].dt.year,
            Month=lambda frame: frame["Date"].dt.month,
            Day=lambda frame: frame["Date"].dt.day,
        )
    )

    return prepared.set_index(["Date", "Ticker"])

# `prepare_data_v2` already copies its input and resets the index internally,
# so the frame is passed as-is instead of being copied and re-indexed twice.
main_data = prepare_data_v2(historical_data)

##### **Note: It is recommended to remove `["Previous Day Close", "Resistance", "Upper Band", "SMA_50", "SMA_200"]` after VIF inspection...**

##### **Note: It is recommended to remove `["Day"]` after p-value inspection...**

In [None]:
# Work on a copy so `main_data` itself is never modified by this cell
select_data = main_data.copy()

# Columns left out of the modelling frame (see the VIF / p-value notes above,
# plus the `Action` column)
select_columns_to_drop = [
    "Action",
    "Previous Day Close",
    "SMA_50",
    "Resistance",
    "Upper Band",
    "SMA_200",
    "Day",
]

data = select_data.drop(select_columns_to_drop, axis="columns")

print("Shape:", data.shape)
data.head()

### Split data features `X` and target `y`
___

In [None]:
# Features (X): every column except the target
X = data.drop("Next Day Close", axis="columns")

# Target (y): the `Next Day Close` column on its own
y = data.loc[:, "Next Day Close"]

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

### Data Splitting
___

In [None]:
# Split the data into training and testing sets
# `random_state` fixes the shuffle so the same rows land in each split on
# every run.
# NOTE(review): `train_test_split` shuffles rows by default, and this frame is
# indexed by (Date, Ticker). Test rows can therefore pre-date training rows —
# confirm a random split (rather than a chronological one) is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2, # 80% training and 20% testing
    random_state=42
)

# Sanity check: row counts of the X/y pairs should match within each split
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

### Scaling the Data
___

#### Scale the data using `StandardScaler`

In [None]:
# Standardize features and target with separate scalers.
# Both scalers learn their statistics from the training split only and are
# then applied unchanged to the test split.
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# StandardScaler expects 2-D input, so the 1-D target is reshaped to a column
y_train_column = y_train.to_numpy().reshape(-1, 1)
y_test_column = y_test.to_numpy().reshape(-1, 1)

y_scaler = StandardScaler().fit(y_train_column)
y_train_scaled = y_scaler.transform(y_train_column)
y_test_scaled = y_scaler.transform(y_test_column)

print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)
print("y_train_scaled shape:", y_train_scaled.shape)
print("y_test_scaled shape:", y_test_scaled.shape)

### Model Training
___


In [None]:
# Define the neural network model
def create_model(input_shape):
    """Build and compile the feed-forward regression network.

    Layer stack: Dense(64, relu) -> Dropout(0.2) -> Dense(32, relu) ->
    Dropout(0.2) -> Dense(16, relu) -> Dense(1).

    Args:
        input_shape: Shape tuple of a single sample, e.g. ``(n_features,)``.

    Returns:
        A ``Sequential`` model compiled with Adam (learning rate 0.001),
        MSE loss and MAE as an additional metric.
    """
    model = Sequential([
        # Declare the input with an explicit Input layer rather than passing
        # `input_shape=` to the first Dense layer, which newer Keras releases
        # warn against for Sequential models.
        Input(shape=input_shape),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1)  # Output layer for regression
    ])

    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae']
    )

    return model

# Seed the Python, NumPy and TensorFlow random generators before the model is
# built, so weight initialisation, dropout masks and batch shuffling repeat on
# every Restart & Run All. The value matches the `random_state` used for the
# train/test split.
set_random_seed(42)

# Create early stopping callback: stop once validation loss has not improved
# for 5 consecutive epochs and roll back to the best weights seen
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# Initialize the model; one input unit per scaled feature column
input_shape = (X_train_scaled.shape[1],)
keras_model = create_model(input_shape)

# Train the model; 20% of the training rows are held out for validation,
# which supplies the `val_loss` monitored by early stopping
history = keras_model.fit(
    X_train_scaled,
    y_train_scaled,
    validation_split=0.2,
    epochs=50,
    batch_size=1024,
    callbacks=[early_stopping],
    verbose=1
)

In [None]:
# Predict on both splits; outputs are still in standardized target units
train_predictions_scaled = keras_model.predict(X_train_scaled)
test_predictions_scaled = keras_model.predict(X_test_scaled)

# Undo the target scaling so every metric is expressed in the original units of y
train_predictions = y_scaler.inverse_transform(train_predictions_scaled)
test_predictions = y_scaler.inverse_transform(test_predictions_scaled)

# --- Training split ---
train_r2 = r2_score(y_train, train_predictions)
train_mae = mean_absolute_error(y_train, train_predictions)
train_mse = mean_squared_error(y_train, train_predictions)
train_rmse = np.sqrt(train_mse)

print("\nTraining Set Metrics:")
print(f"RMSE: {train_rmse:.4f}")
print(f"MAE: {train_mae:.4f}")
print(f"R²: {train_r2:.4f}")

# --- Test split ---
test_r2 = r2_score(y_test, test_predictions)
test_mae = mean_absolute_error(y_test, test_predictions)
test_mse = mean_squared_error(y_test, test_predictions)
test_rmse = np.sqrt(test_mse)

print("\nTest Set Metrics:")
print(f"RMSE: {test_rmse:.4f}")
print(f"MAE: {test_mae:.4f}")
print(f"R²: {test_r2:.4f}")

In [None]:
# Two side-by-side panels: loss on the left, MAE on the right
fig, (loss_ax, mae_ax) = plt.subplots(1, 2, figsize=(12, 4))
curves = history.history

# Left panel: training vs. validation loss per epoch
loss_ax.plot(curves['loss'], label='Training Loss')
loss_ax.plot(curves['val_loss'], label='Validation Loss')
loss_ax.set_title('Model Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

# Right panel: training vs. validation MAE per epoch
mae_ax.plot(curves['mae'], label='Training MAE')
mae_ax.plot(curves['val_mae'], label='Validation MAE')
mae_ax.set_title('Model MAE')
mae_ax.set_xlabel('Epoch')
mae_ax.set_ylabel('MAE')
mae_ax.legend()

fig.tight_layout()
plt.show()