In [166]:
import numpy as np
import pandas as pd
import yfinance as yf
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score, cross_validate, KFold
from sklearn.preprocessing import MinMaxScaler

# Data Collection and Preprocessing

In [169]:
# Market indices to analyse and the date window to download
tickers = ['^GSPC', '^IXIC', '^N225']
start_date = '2020-01-01'
end_date = '2022-01-01'

# Fetch daily bars for all tickers, with columns grouped as (ticker, field).
# auto_adjust=False is passed explicitly so that the separate 'Adj Close' column,
# which the feature-engineering and modelling cells read, is always present;
# recent yfinance releases default to auto_adjust=True, which omits that column.
raw_data = yf.download(
    tickers,
    start=start_date,
    end=end_date,
    group_by='ticker',
    auto_adjust=False,
)

# 'Adj Close' is the price series used downstream, so the unadjusted 'Close'
# column of each ticker is removed (only where it actually exists).
close_columns = [
    (ticker, 'Close') for ticker in tickers if (ticker, 'Close') in raw_data.columns
]

# Apply LOCF (last observation carried forward) for missing values across the
# entire dataset. Built as a new frame so re-running this cell is repeatable.
data = raw_data.drop(columns=close_columns).ffill()

[*********************100%***********************]  3 of 3 completed


# Feature Engineering

The following code calculates several technical indicators for stock data, including Simple Moving Average (SMA), Relative Strength Index (RSI), Moving Average Convergence Divergence (MACD), and Bollinger Bands for multiple stocks. These indicators are widely used in the analysis of financial markets to help identify market trends and potential trading opportunities.

## Indicators

### 1. Simple Moving Average (SMA)
- **Purpose**: Provides a smooth data series by calculating the average of a specified number of past prices.
- **Calculation**: The 30-day SMA is calculated, which represents the average of the last 30 adjusted closing prices (`Adj Close`).

### 2. Relative Strength Index (RSI)
- **Purpose**: Measures the velocity and magnitude of directional price movements. It provides signals about bullish or bearish price momentum.
- **Calculation**: The 14-day RSI is computed, which involves:
  - Identifying the magnitude of recent gains and losses.
  - Calculating the average of these gains and losses over 14 days.
  - Normalizing the result to an oscillator between 0 and 100.

### 3. Moving Average Convergence Divergence (MACD)
- **Purpose**: Tracks the relationship between two moving averages of a stock's price.
- **Calculation**:
  - The MACD line is calculated as the difference between the 12-day and 26-day exponential moving averages (EMA) of the closing prices.
  - A signal line, which is the 9-day EMA of the MACD line itself, is also calculated. This acts as a trigger for buy and sell signals.

### 4. Bollinger Bands
- **Purpose**: Measures market volatility and provides insights into price levels relative to previous trades.
- **Calculation**:
  - A 20-day SMA of the closing prices forms the middle band.
  - The upper and lower bands are then set two standard deviations above and below this middle band, respectively.
 
### Dropping NaN Values in Financial Data

'NaN' values in technical indicators like SMA, RSI, and Bollinger Bands typically occur because there isn't enough historical data to perform the calculation (e.g., not enough past days for a 30-day SMA at the start of a dataset). Removing these 'NaN' values is beneficial as it ensures that all analyses and models are based only on periods where full data is available, thereby enhancing the reliability and validity of financial assessments and predictions.


In [172]:
# Adds technical-indicator columns to `data` (the (ticker, field) frame built above)
# and then drops the warm-up rows for which the rolling indicators are undefined.

# Look-back windows (in rows, i.e. trading days) for each indicator
window_sma = 30
window_rsi = 14
window_macd_short = 12
window_macd_long = 26
signal_line_window = 9
window_boll = 20


def _technical_indicators(price):
    """Compute the indicator series for one price series.

    Parameters
    ----------
    price : pandas.Series
        Price series of a single ticker (the caller passes its 'Adj Close').

    Returns
    -------
    dict
        Maps indicator name to a Series aligned with ``price``. The dict order
        (SMA_30, RSI_14, MACD, Signal_Line, Upper_Band, Lower_Band) is the order
        in which the caller inserts the columns.
    """
    # RSI: simple rolling means of the positive and negative daily changes
    delta = price.diff(1)
    avg_gain = delta.where(delta > 0, 0).rolling(window=window_rsi).mean()
    avg_loss = -delta.where(delta < 0, 0).rolling(window=window_rsi).mean()
    rsi = 100 - (100 / (1 + avg_gain / avg_loss))

    # MACD: short EMA minus long EMA; the signal line is an EMA of the MACD itself
    ema_short = price.ewm(span=window_macd_short, adjust=False).mean()
    ema_long = price.ewm(span=window_macd_long, adjust=False).mean()
    macd = ema_short - ema_long
    signal_line = macd.ewm(span=signal_line_window, adjust=False).mean()

    # Bollinger Bands: rolling mean +/- two rolling standard deviations
    boll_mid = price.rolling(window=window_boll).mean()
    boll_std = price.rolling(window=window_boll).std()

    return {
        'SMA_30': price.rolling(window=window_sma).mean(),
        'RSI_14': rsi,
        'MACD': macd,
        'Signal_Line': signal_line,
        'Upper_Band': boll_mid + 2 * boll_std,
        'Lower_Band': boll_mid - 2 * boll_std,
    }


for ticker in tickers:
    # New columns go immediately after this ticker's 'Adj Close', in dict order
    position = data.columns.get_loc((ticker, 'Adj Close')) + 1
    indicators = _technical_indicators(data[(ticker, 'Adj Close')])
    for offset, (name, series) in enumerate(indicators.items()):
        data.insert(loc=position + offset, column=(ticker, name), value=series)

# Drop rows where any rolling-window indicator is NaN. The column list is derived
# from `tickers` so it cannot drift out of sync with the tickers downloaded above.
rolling_indicator_names = ('SMA_30', 'RSI_14', 'Upper_Band', 'Lower_Band')
warmup_columns = [
    (ticker, name) for ticker in tickers for name in rolling_indicator_names
]
data = data.dropna(subset=warmup_columns)

# Normalization

## Overview
Normalization is a critical preprocessing step in data analysis, and for this dataset, we use the **MinMaxScaler** from Python's scikit-learn library to adjust the scales of features. This type of normalization transforms features by scaling them to a given range, specifically between 0 and 1. This approach is particularly beneficial for financial datasets as it preserves the relationships among data points while standardizing the range. By normalizing features to a common scale, we prevent variables with larger scales from dominating the model's behavior, which is crucial for algorithms sensitive to input scale such as neural networks and distance-based algorithms like k-NN.


## Features Normalized
- **Price-related Features**: 'Open', 'High', 'Low', and 'Adj Close' are normalized because these features vary significantly in magnitude and can skew the performance of machine learning models.
- **Volume**: Since trade volume can range over several orders of magnitude, normalizing this feature helps to maintain its proportional impact relative to price features.
- **Derived Indicators (SMA, Bollinger Bands)**: These are based on price data and share its scale. Normalizing these alongside price data ensures consistency in scale across all price-related features.

## Features Not Normalized
- **RSI**: This indicator ranges from 0 to 100, representing overbought and oversold conditions. Normalizing RSI would strip it of its interpretative value.
- **MACD**: Though derived from price, the MACD is a difference between two EMAs and its value (including its signal line) has meaning in its scale relative to zero. Normalizing these would disrupt their threshold-based interpretation.


The decision to normalize certain features while excluding others is based on maintaining the utility and interpretability of each feature. By standardizing the scale of direct measurements and derived statistics based on those measurements, we enhance model reliability without compromising the data's inherent signals.


In [175]:
def normalize_data(df, features):
    """Return a copy of ``df`` with the given columns min-max scaled.

    Each column listed in ``features`` is scaled independently with a freshly
    fitted ``MinMaxScaler``; all other columns are returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to scale. It is not modified.
    features : list
        Column labels to scale (tuples when ``df`` has multi-level columns).

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``.
    """
    # Work on a copy so the caller's frame (often a slice of a larger frame)
    # is never mutated behind its back.
    scaled = df.copy()

    # Nothing to fit on: the scaler would raise for zero columns or zero rows.
    if len(features) == 0 or scaled.empty:
        return scaled

    scaler = MinMaxScaler()
    scaled.loc[:, features] = scaler.fit_transform(scaled[features])
    return scaled

# Min-max scale the price-like columns of every ticker, one ticker at a time.
# Columns are addressed as (ticker, field) tuples because `data` has
# multi-level columns.
scaled_fields = ('Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
                 'SMA_30', 'Upper_Band', 'Lower_Band')

for symbol in tickers:
    # Keep only the (ticker, field) pairs that are actually present in `data`
    present = [
        (symbol, field) for field in scaled_fields if (symbol, field) in data.columns
    ]

    # Skip tickers that have none of the scalable columns
    if not present:
        continue

    # Write the scaled values back into the same multi-level columns
    data.loc[:, present] = normalize_data(data.loc[:, present], present)

In [177]:
# One single-level frame per index, selected by the top column level of `data`
gspc_df, ixic_df, n225_df = (
    data[symbol] for symbol in ('^GSPC', '^IXIC', '^N225')
)

# Random Forest Regressor

## 10-fold cross validation

In [179]:
# Shared modelling setup, reused unchanged for every index below.

# Scorers. The two error metrics are built with greater_is_better=False, so
# their scores come back negated and are negated again when printed.
r2_scorer = make_scorer(r2_score)
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# 10-fold cross-validation with shuffled rows and a fixed seed
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Random Forest with 100 trees and a fixed seed for reproducibility
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

## RFR on S&P Data

In [181]:
# Features are every S&P 500 column except 'Adj Close', which is the target.
# NOTE(review): the features include same-day columns and indicators derived
# from 'Adj Close' itself, and `kf` shuffles rows, so these scores look like
# in-sample interpolation rather than forecasting skill — confirm this is intended.
features_gspc = gspc_df.drop(columns=['Adj Close'])
target_gspc = gspc_df['Adj Close']

# A single cross_validate pass evaluates all three metrics on the same fitted
# forests, instead of refitting every fold once per metric.
cv_results_gspc = cross_validate(
    rf_regressor,
    features_gspc,
    target_gspc,
    cv=kf,
    scoring={'mse': mse_scorer, 'mae': mae_scorer, 'r2': r2_scorer},
)
mse_scores_gspc = cv_results_gspc['test_mse']
mae_scores_gspc = cv_results_gspc['test_mae']
r2_scores_gspc = cv_results_gspc['test_r2']

# Output results (the error scorers return negated values, hence the sign flip)
print("MSE scores for S&P 500:", -mse_scores_gspc)
print("MAE scores for S&P 500:", -mae_scores_gspc)
print("R^2 scores for S&P 500:", r2_scores_gspc)

MSE scores for S&P 500: [1.02055648e-04 1.56325018e-04 2.02303527e-04 1.67351061e-04
 9.38132241e-05 6.52813349e-05 5.47071782e-05 6.97905402e-05
 5.28237382e-05 1.28197828e-04]
MAE scores for S&P 500: [0.00735436 0.00731872 0.00976686 0.00765436 0.00733246 0.00598535
 0.00561658 0.00608296 0.00523029 0.00706106]
R^2 scores for S&P 500: [0.99843147 0.99752246 0.99731301 0.99722168 0.99791196 0.99830136
 0.9990156  0.99877069 0.99901452 0.99791266]


## RFR on NASDAQ Data

In [185]:
# Features are every NASDAQ column except 'Adj Close', which is the target.
# NOTE(review): the features include same-day columns and indicators derived
# from 'Adj Close' itself, and `kf` shuffles rows, so these scores look like
# in-sample interpolation rather than forecasting skill — confirm this is intended.
features_ixic = ixic_df.drop(columns=['Adj Close'])
target_ixic = ixic_df['Adj Close']

# A single cross_validate pass evaluates all three metrics on the same fitted
# forests, instead of refitting every fold once per metric.
cv_results_ixic = cross_validate(
    rf_regressor,
    features_ixic,
    target_ixic,
    cv=kf,
    scoring={'mse': mse_scorer, 'mae': mae_scorer, 'r2': r2_scorer},
)
mse_scores_ixic = cv_results_ixic['test_mse']
mae_scores_ixic = cv_results_ixic['test_mae']
r2_scores_ixic = cv_results_ixic['test_r2']

# Output results (the error scorers return negated values, hence the sign flip)
print("MSE scores for NASDAQ:", -mse_scores_ixic)
print("MAE scores for NASDAQ:", -mae_scores_ixic)
print("R^2 scores for NASDAQ:", r2_scores_ixic)

MSE scores for NASDAQ: [9.42493984e-05 1.10628703e-04 1.12016697e-04 1.18103708e-04
 1.19338993e-04 7.62248052e-05 6.97150158e-05 9.25829137e-05
 7.11253507e-05 1.25127717e-04]
MAE scores for NASDAQ: [0.00713573 0.00792831 0.00803004 0.00782663 0.00849072 0.00683342
 0.00649631 0.00728687 0.00705442 0.00808042]
R^2 scores for NASDAQ: [0.99875219 0.99839015 0.9987438  0.99823561 0.99778411 0.99805524
 0.99888824 0.99862247 0.99885101 0.99816513]


In [None]:
## RFR on Nikkei 225 Data

In [186]:
# Features are every Nikkei 225 column except 'Adj Close', which is the target.
# NOTE(review): the features include same-day columns and indicators derived
# from 'Adj Close' itself, and `kf` shuffles rows, so these scores look like
# in-sample interpolation rather than forecasting skill — confirm this is intended.
features_n225 = n225_df.drop(columns=['Adj Close'])
target_n225 = n225_df['Adj Close']

# A single cross_validate pass evaluates all three metrics on the same fitted
# forests, instead of refitting every fold once per metric.
cv_results_n225 = cross_validate(
    rf_regressor,
    features_n225,
    target_n225,
    cv=kf,
    scoring={'mse': mse_scorer, 'mae': mae_scorer, 'r2': r2_scorer},
)
mse_scores_n225 = cv_results_n225['test_mse']
mae_scores_n225 = cv_results_n225['test_mae']
r2_scores_n225 = cv_results_n225['test_r2']

# Output results (the error scorers return negated values, hence the sign flip)
print("MSE scores for Nikkei 225:", -mse_scores_n225)
print("MAE scores for Nikkei 225:", -mae_scores_n225)
print("R^2 scores for Nikkei 225:", r2_scores_n225)

MSE scores for Nikkei 225: [1.28546787e-04 5.94961526e-05 2.19048323e-04 1.03056174e-04
 1.02707332e-04 5.99929979e-05 1.28723748e-04 2.20058392e-04
 6.93193904e-05 2.21396407e-04]
MAE scores for Nikkei 225: [0.0089905  0.00522709 0.01015812 0.00722416 0.00702143 0.00618636
 0.00748158 0.00965985 0.00617267 0.00859926]
R^2 scores for Nikkei 225: [0.99789078 0.9990107  0.997697   0.99822246 0.99819017 0.99863725
 0.9979525  0.99675822 0.99885157 0.99689881]
