In [1]:
import yfinance as yf
import pandas as pd


# Download daily S&P 500 (^GSPC) bars. auto_adjust is passed explicitly:
# yfinance warns that its default changed to True, so pinning the value keeps
# the prices the same regardless of which default the installed version uses.
sp500 = yf.download("^GSPC", start="2006-01-01", end="2025-04-10", auto_adjust=True)

# Keep only the OHLCV fields; .copy() detaches the selection from the downloaded
# frame so that columns assigned later are not written to a view of it.
sp500 = sp500[['Open', 'High', 'Low', 'Close', 'Volume']].copy()

# Keep only the first level of the column labels (the field name).
sp500.columns = sp500.columns.get_level_values(0)


print(sp500)





YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed

Price              Open         High          Low        Close      Volume
Date                                                                      
2006-01-03  1248.290039  1270.219971  1245.739990  1268.800049  2554570000
2006-01-04  1268.800049  1275.369995  1267.739990  1273.459961  2515330000
2006-01-05  1273.459961  1276.910034  1270.300049  1273.479980  2433340000
2006-01-06  1273.479980  1286.089966  1273.479980  1285.449951  2446560000
2006-01-09  1285.449951  1290.780029  1284.819946  1290.150024  2301490000
...                 ...          ...          ...          ...         ...
2025-04-03  5492.740234  5499.529785  5390.830078  5396.520020  7210470000
2025-04-04  5292.140137  5292.140137  5069.899902  5074.080078  8853500000
2025-04-07  4953.790039  5246.569824  4835.040039  5062.250000  8691980000
2025-04-08  5193.569824  5267.470215  4910.419922  4982.770020  7408140000
2025-04-09  4965.279785  5481.339844  4948.430176  5456.899902  9489600000

[4848 rows x 5 columns]





In [2]:
pip install ta


Note: you may need to restart the kernel to use updated packages.


In [3]:
import ta  
from ta.trend import MACD

def add_technical_indicators(df):
    """Return a copy of ``df`` with MACD, RSI and ATR columns appended.

    All three indicators are computed with the ``ta`` library's default
    parameters (no window sizes are passed here).

    Parameters
    ----------
    df : pandas.DataFrame
        Price data containing at least the columns ``'Close'``, ``'High'``
        and ``'Low'``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus ``'MACD'``, ``'RSI'``
        and ``'ATR'``. The input frame is left unmodified.

    Raises
    ------
    ValueError
        If any of the required price columns is missing.
    """
    required_cols = {'Close', 'High', 'Low'}
    missing = required_cols - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # Work on a copy so the caller's frame (possibly a slice of another frame)
    # is never mutated as a side effect.
    out = df.copy()

    macd_indicator = MACD(close=out['Close'])
    out['MACD'] = macd_indicator.macd()
    # The signal line and histogram (macd_indicator.macd_signal() /
    # macd_indicator.macd_diff()) are deliberately not added as features.

    # RSI
    out['RSI'] = ta.momentum.rsi(out['Close'])

    # ATR
    out['ATR'] = ta.volatility.average_true_range(out['High'], out['Low'], out['Close'])

    return out


In [4]:
# Add the MACD / RSI / ATR columns and rebind sp500 to the returned frame.
sp500 = add_technical_indicators(sp500)
print(sp500)


Price              Open         High          Low        Close      Volume  \
Date                                                                         
2006-01-03  1248.290039  1270.219971  1245.739990  1268.800049  2554570000   
2006-01-04  1268.800049  1275.369995  1267.739990  1273.459961  2515330000   
2006-01-05  1273.459961  1276.910034  1270.300049  1273.479980  2433340000   
2006-01-06  1273.479980  1286.089966  1273.479980  1285.449951  2446560000   
2006-01-09  1285.449951  1290.780029  1284.819946  1290.150024  2301490000   
...                 ...          ...          ...          ...         ...   
2025-04-03  5492.740234  5499.529785  5390.830078  5396.520020  7210470000   
2025-04-04  5292.140137  5292.140137  5069.899902  5074.080078  8853500000   
2025-04-07  4953.790039  5246.569824  4835.040039  5062.250000  8691980000   
2025-04-08  5193.569824  5267.470215  4910.419922  4982.770020  7408140000   
2025-04-09  4965.279785  5481.339844  4948.430176  5456.899902  

In [5]:
!pip install pandas_datareader




In [6]:
import yfinance as yf
import pandas_datareader.data as web

def add_macro_indicators(df, start="2006-01-01", end="2024-03-31"):
    """Join market and macro-economic series onto ``df`` by index.

    Downloads the VIX and US dollar index closes through ``yfinance`` and the
    EFFR, UNRATE and UMCSENT series from FRED through ``pandas_datareader``,
    aligns them on the VIX index, forward-fills gaps and joins the result
    onto ``df``. Relies on the cell-level imports ``yf`` and ``web``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is joined on its index and is not modified.
    start, end : str
        Date range passed unchanged to both data sources.

    Returns
    -------
    pandas.DataFrame
        A new frame with the macro columns appended. Column labels are kept
        exactly as the join produces them (no flattening is done here).

    Notes
    -----
    NOTE(review): the default ``end`` (2024-03-31) is earlier than the end of
    the price download in this notebook (2025-04-10), so later rows receive
    NaN macro values from the join. Confirm whether that is intended.

    NOTE(review): the FRED series are joined onto the VIX index before the
    forward fill, so observations dated on a day absent from that index
    (presumably month-start dates of monthly series) would be dropped rather
    than carried forward. Verify against the downloaded data.
    """
    vix = yf.download("^VIX", start=start, end=end)[['Close']].rename(columns={"Close": "VIX"})
    usdx = yf.download("DX-Y.NYB", start=start, end=end)[['Close']].rename(columns={"Close": "USDX"})

    effr = web.DataReader("EFFR", "fred", start, end)
    unrate = web.DataReader("UNRATE", "fred", start, end)
    umcsent = web.DataReader("UMCSENT", "fred", start, end)

    macro = vix.join([usdx, effr, unrate, umcsent])
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    macro = macro.ffill()

    # Drop macro columns left over from a previous run so the join cannot
    # collide; drop() returns a new frame, leaving the caller's frame intact.
    for col in macro.columns:
        if col in df.columns:
            df = df.drop(columns=[col])

    return df.join(macro)


In [7]:
# Attach the VIX, USD index and FRED series using the function's default date range.
sp500 = add_macro_indicators(sp500)
print(sp500)



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


                   Open         High          Low        Close      Volume  \
Date                                                                         
2006-01-03  1248.290039  1270.219971  1245.739990  1268.800049  2554570000   
2006-01-04  1268.800049  1275.369995  1267.739990  1273.459961  2515330000   
2006-01-05  1273.459961  1276.910034  1270.300049  1273.479980  2433340000   
2006-01-06  1273.479980  1286.089966  1273.479980  1285.449951  2446560000   
2006-01-09  1285.449951  1290.780029  1284.819946  1290.150024  2301490000   
...                 ...          ...          ...          ...         ...   
2025-04-03  5492.740234  5499.529785  5390.830078  5396.520020  7210470000   
2025-04-04  5292.140137  5292.140137  5069.899902  5074.080078  8853500000   
2025-04-07  4953.790039  5246.569824  4835.040039  5062.250000  8691980000   
2025-04-08  5193.569824  5267.470215  4910.419922  4982.770020  7408140000   
2025-04-09  4965.279785  5481.339844  4948.430176  5456.899902  

In [8]:
def make_soft_labels(y_tensor, num_classes, sigma=0.7):
    """Turn integer class labels into Gaussian-smoothed label distributions.

    For each label ``c`` the weight of class ``k`` is proportional to
    ``exp(-0.5 * ((k - c) / sigma) ** 2)``; every row is normalised to sum to 1.

    Parameters
    ----------
    y_tensor : torch.Tensor
        Class indices; flattened to one label per output row.
    num_classes : int
        Number of classes (width of each output row).
    sigma : float, default 0.7
        Spread of the Gaussian, in units of class index. Must be positive.

    Returns
    -------
    torch.Tensor
        Float tensor of shape ``(len(labels), num_classes)`` on the same
        device as ``y_tensor``. Empty input yields a ``(0, num_classes)`` tensor.

    Raises
    ------
    ValueError
        If ``sigma`` is not positive (it would produce NaN weights).
    """
    if sigma <= 0:
        raise ValueError(f"sigma must be positive, got {sigma}")

    # Column of labels against a row of class ids: broadcasting produces the
    # full (n_labels, num_classes) distance matrix without a Python loop.
    labels = y_tensor.float().reshape(-1, 1)
    class_ids = torch.arange(num_classes, device=y_tensor.device).float().reshape(1, -1)
    distances = class_ids - labels

    weights = torch.exp(-0.5 * (distances / sigma) ** 2)
    return weights / weights.sum(dim=1, keepdim=True)


In [9]:
import numpy as np
from sklearn.preprocessing import StandardScaler


# Flatten tuple column labels left by the macro join into "first_second"
# strings (e.g. 'VIX_^VIX'); plain string labels pass through unchanged.
sp500.columns = [f'{col[0]}_{col[1]}' if isinstance(col, tuple) else col for col in sp500.columns]


feature_cols = [
    'Open', 'High', 'Low', 'Close', 'Volume',
    'MACD', 'RSI', 'ATR',
    'EFFR', 'UNRATE', 'UMCSENT',
    'VIX_^VIX', 'USDX_DX-Y.NYB'
]


# Log return from the previous close to the current close (first row is NaN).
sp500['log_return'] = np.log(sp500['Close'] / sp500['Close'].shift(1))


## Classification 9 classes

num_classes = 9


# Quantile bins: labels=False yields integer bin codes, retbins=True also
# returns the bin edges.
# NOTE(review): the bins are computed over the whole series, including rows
# that later land in the test split. Confirm this look-ahead is acceptable.
sp500['return_class'], bin_edges = pd.qcut(
    sp500['log_return'],
    q=num_classes,
    labels=False,
    retbins=True,
    duplicates='drop'
)

# duplicates='drop' merges bins with identical edges instead of raising, which
# would silently leave fewer classes than the models are built for.
assert len(bin_edges) - 1 == num_classes, (
    f"qcut produced {len(bin_edges) - 1} bins, expected {num_classes}"
)



## Cleaning 
def clean_and_prepare_data(df, feature_cols, label_col, window_size=120):
    """Build standardised sliding-window samples and their labels.

    Rows with a missing feature or label are dropped, zero-variance feature
    columns are removed, and the remaining features are standardised. Sample
    ``k`` holds the ``window_size`` consecutive rows ending just before row
    ``window_size + k``; its label is taken from that row.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    feature_cols : list of str
        Feature column names (the caller's list is not modified).
    label_col : str
        Label column name.
    window_size : int, default 120
        Number of consecutive rows per sample.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` of shape ``(n_samples, window_size, n_features)`` and ``y`` of
        shape ``(n_samples,)``. When too few rows remain to form a window,
        ``X`` has shape ``(0, window_size, n_features)`` so that downstream
        ``X.shape[2]`` lookups still work.

    Raises
    ------
    AssertionError
        If a feature column or the label column is absent from ``df``.
    """
    for col in feature_cols:
        assert col in df.columns, f"col not exist: {col}"
    assert label_col in df.columns, f"label_col not exist: {label_col}"

    df_clean = df.dropna(subset=feature_cols + [label_col]).copy()

    # Constant columns carry no information and cannot be standardised.
    stds = df_clean[feature_cols].std()
    zero_std_cols = stds[stds == 0].index.tolist()
    if zero_std_cols:
        print(f"Remove because of zero {zero_std_cols}")
    feature_cols = [col for col in feature_cols if col not in zero_std_cols]

    # NOTE(review): the scaler is fit on every remaining row, and this notebook
    # splits train/test only afterwards, so test-period statistics influence
    # the training features. Confirm whether that leakage is acceptable.
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(df_clean[feature_cols])

    n_rows = len(df_clean)
    # Labels of the rows that directly follow each window, taken in one slice.
    y = np.array(df_clean[label_col].iloc[window_size:].tolist())

    if n_rows <= window_size:
        # No complete window: return an empty but correctly shaped X.
        return np.empty((0, window_size, len(feature_cols))), y

    X = np.array([scaled_features[i - window_size:i] for i in range(window_size, n_rows)])
    return X, y

# Build 120-row windows over the feature columns; each label is the
# 'return_class' of the row that immediately follows its window.
X, y = clean_and_prepare_data(sp500, feature_cols, 'return_class', window_size=120)

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (4445, 120, 13)
y shape: (4445,)


In [10]:
import pandas as pd


# Rows per return class, ordered by class id. Counted over all labelled rows
# of sp500, not only the windowed samples in y.
class_counts = sp500['return_class'].value_counts().sort_index()
print("Number of sample in each class")
print(class_counts)


Number of sample in each class
0.0    539
1.0    538
2.0    539
3.0    538
4.0    539
5.0    538
6.0    539
7.0    538
8.0    539
Name: return_class, dtype: int64


In [11]:
from sklearn.model_selection import train_test_split


# Ordered split: shuffle=False keeps the samples in their original order, so
# the test set is the final 20% of the windows.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

for split_name, split_array in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape:", split_array.shape)


X_train shape: (3556, 120, 13)
X_test shape: (889, 120, 13)
y_train shape: (3556,)
y_test shape: (889,)


In [12]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader


num_classes = 9
batch_size = 64

# Seed torch's global RNG so that DataLoader shuffling, and the initial weights
# of any model created afterwards, are repeatable after Restart & Run All.
SEED = 42
torch.manual_seed(SEED)


# Full dataset as tensors (features as float32, labels as int64 class ids).
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)


X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)


train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Only the training batches are shuffled; the test loader keeps sample order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [13]:
import torch
import torch.nn as nn


class SimpleRNNClassifier(nn.Module):
    """Single ``nn.RNN`` encoder followed by a two-layer classification head.

    The RNN output at the final time step is fed through
    ``Linear(hidden_size, 128) -> ReLU -> Linear(128, num_classes)``.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return class logits for a batch-first sequence tensor ``x``."""
        sequence_out, _ = self.rnn(x)
        # Only the output at the last time step feeds the classifier head.
        last_step = sequence_out[:, -1, :]
        hidden = self.fc1(last_step)
        return self.fc2(self.relu(hidden))



class LSTMClassifier(nn.Module):
    """Single ``nn.LSTM`` encoder followed by a two-layer classification head.

    The LSTM output at the final time step is fed through
    ``Linear(hidden_size, 128) -> ReLU -> Linear(128, num_classes)``.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return class logits for a batch-first sequence tensor ``x``."""
        sequence_out, _ = self.lstm(x)
        # Only the output at the last time step feeds the classifier head.
        last_step = sequence_out[:, -1, :]
        hidden = self.fc1(last_step)
        return self.fc2(self.relu(hidden))
    


class ESNClassifier(nn.Module):
    """Leaky-integrator reservoir network with a small MLP readout.

    The recurrent (reservoir) matrix is random, rescaled so that its largest
    eigenvalue magnitude equals ``spectral_radius``, and excluded from
    training. The input projection, the bias and the readout are trainable.
    The readout receives the reservoir state averaged over all time steps.
    """

    def __init__(self, input_dim, reservoir_size, num_classes, spectral_radius=0.9, leaking_rate=0.15):
        super().__init__()
        self.reservoir_size = reservoir_size
        self.leaking_rate = leaking_rate
        self.spectral_radius = spectral_radius

        # Trainable input projection and bias, both with a small random init.
        self.input_weights = nn.Parameter(
            0.3 * torch.randn(input_dim, reservoir_size),
            requires_grad=True,
        )
        self.bias = nn.Parameter(0.01 * torch.randn(reservoir_size), requires_grad=True)

        # Fixed reservoir: draw a random matrix, then rescale it so that the
        # largest |eigenvalue| matches the requested spectral radius.
        recurrent = torch.randn(reservoir_size, reservoir_size) / (reservoir_size ** 0.5)
        largest_eig = torch.linalg.eigvals(recurrent).abs().max().item()
        recurrent = (recurrent / largest_eig) * spectral_radius
        self.reservoir_weights = nn.Parameter(recurrent, requires_grad=False)

        self.readout = nn.Sequential(
            nn.Linear(reservoir_size, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes),
        )

    def forward(self, x):
        """Return class logits for ``x`` of shape (batch, seq_len, input_dim)."""
        n_batch, n_steps, _ = x.shape
        state = torch.zeros(n_batch, self.reservoir_size, device=x.device)
        trajectory = []

        for step in range(n_steps):
            drive = x[:, step, :] @ self.input_weights + state @ self.reservoir_weights + self.bias
            candidate = torch.tanh(drive)
            # Leaky update: blend the previous state with the new activation.
            state = (1 - self.leaking_rate) * state + self.leaking_rate * candidate
            trajectory.append(state)

        # (batch, seq_len, reservoir_size) -> mean over time -> (batch, reservoir_size)
        pooled = torch.stack(trajectory, dim=1).mean(dim=1)
        return self.readout(pooled)


In [14]:
from sklearn.utils.class_weight import compute_class_weight
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One instance per architecture, each with 64 recurrent units and sized to the
# feature dimension of X.
n_features = X.shape[2]
models = {
    "RNN": SimpleRNNClassifier(input_size=n_features, hidden_size=64, num_classes=num_classes).to(device),
    "LSTM": LSTMClassifier(input_size=n_features, hidden_size=64, num_classes=num_classes).to(device),
    "ESN": ESNClassifier(
        input_dim=n_features,
        reservoir_size=64,
        num_classes=num_classes,
        spectral_radius=0.95,
        leaking_rate=0.1,
    ).to(device),
}

# Per-class loss weights from scikit-learn's 'balanced' heuristic, computed on
# the training labels only and moved to the training device as a float tensor.
class_weights = torch.tensor(
    compute_class_weight(
        class_weight='balanced',
        classes=np.arange(num_classes),
        y=y_train,
    ),
    dtype=torch.float,
).to(device)



In [15]:
## Bin range

print("Bin edges:", bin_edges)


# Class i covers the interval between consecutive quantile edges.
for i, (lower, upper) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
    print(f"Class {i}: from {lower:.4f} to {upper:.4f}")

    




Bin edges: [-0.1276522  -0.01129563 -0.00519785 -0.00207425 -0.00018688  0.00159394
  0.00374235  0.00658771  0.01135793  0.10957197]
Class 0: from -0.1277 to -0.0113
Class 1: from -0.0113 to -0.0052
Class 2: from -0.0052 to -0.0021
Class 3: from -0.0021 to -0.0002
Class 4: from -0.0002 to 0.0016
Class 5: from 0.0016 to 0.0037
Class 6: from 0.0037 to 0.0066
Class 7: from 0.0066 to 0.0114
Class 8: from 0.0114 to 0.1096


In [16]:
def train_model(model, name, num_epochs=10, lr=0.001,
                data_loader=None, loss_weights=None, run_device=None):
    """Train ``model`` with class-weighted cross-entropy and Adam.

    Prints one progress line per epoch and returns the per-epoch metrics.

    Parameters
    ----------
    model : torch.nn.Module
        Model to train in place. It is not moved here, so it must already live
        on the device being used.
    name : str
        Label used in the progress lines.
    num_epochs : int, default 10
        Number of passes over the data.
    lr : float, default 0.001
        Adam learning rate.
    data_loader : iterable of (inputs, targets), optional
        Training batches. Defaults to the notebook-level ``train_loader``.
    loss_weights : torch.Tensor, optional
        Per-class weights for the loss. Defaults to the notebook-level
        ``class_weights``.
    run_device : torch.device, optional
        Device the batches are moved to. Defaults to the notebook-level
        ``device``.

    Returns
    -------
    list of dict
        One entry per epoch with keys ``"epoch"`` (1-based), ``"loss"`` (sum
        of the batch losses, as printed) and ``"accuracy"`` (training accuracy
        in percent, measured while the weights are being updated).
    """
    # Fall back to the notebook globals so existing calls keep working.
    if data_loader is None:
        data_loader = train_loader
    if loss_weights is None:
        loss_weights = class_weights
    if run_device is None:
        run_device = device

    criterion = nn.CrossEntropyLoss(weight=loss_weights.to(run_device))
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    history = []
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        correct = 0
        total = 0

        for X_batch, y_batch in data_loader:
            X_batch, y_batch = X_batch.to(run_device), y_batch.to(run_device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            predicted = torch.argmax(outputs, dim=1)
            correct += (predicted == y_batch).sum().item()
            total += y_batch.size(0)

        # An empty loader yields no samples; report 0% instead of dividing by zero.
        acc = correct / total * 100 if total else 0.0
        print(f"[{name}] Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}, Accuracy: {acc:.2f}%")
        history.append({"epoch": epoch + 1, "loss": total_loss, "accuracy": acc})

    return history


In [17]:
## Train Model

# Train the three architectures one after another with the same epoch budget.
for model_name in ("RNN", "LSTM", "ESN"):
    train_model(models[model_name], model_name, num_epochs=150)


