In [122]:
import math
import os
import random
import numpy as np
import pandas as pd
import yfinance as yf

# File name to store the data locally
data_filename = "prices.csv"

# Define the 5 assets (tickers) to include in the portfolio
tickers = ["AAPL", "MSFT", "GOOGL", "AMZN", "TSLA"]
# Check if the data file exists
if os.path.exists(data_filename):
    print("Loading data from file...")
    prices_df = pd.read_csv(data_filename, index_col=0, parse_dates=True)
else:
    print("File not found. Downloading data from yfinance...")
    # Download data from yfinance
    df = yf.download(tickers, start="2019-01-01", end="2023-12-31", interval="1d")
    # Extract the 'Close' prices from the MultiIndex DataFrame
    prices_df = df.xs('Close', axis=1, level='Price')
    # Drop rows with missing data and save to file for future use
    prices_df.dropna(inplace=True)
    prices_df.to_csv(data_filename)
    print("Data downloaded and saved to", data_filename)

print(prices_df.head())

# Split into training (first 4 years) and testing (last 1 year)
train_df = prices_df[prices_df.index < "2023-01-01"]
test_df  = prices_df[prices_df.index >= "2023-01-01"]

# Convert price DataFrames to numpy arrays for faster calculations
train_prices = train_df.values  # shape: [train_days, 5]
test_prices  = test_df.values   # shape: [test_days, 5]
num_assets = train_prices.shape[1]
print(f"Training days: {train_prices.shape[0]}, Testing days: {test_prices.shape[0]}")

Loading data from file...
                 AAPL       AMZN      GOOGL       MSFT       TSLA
Date                                                             
2019-01-02  37.667183  76.956497  52.483086  95.119820  20.674667
2019-01-03  33.915260  75.014000  51.029533  91.620537  20.024000
2019-01-04  35.363064  78.769501  53.647011  95.881767  21.179333
2019-01-07  35.284367  81.475502  53.540031  96.004059  22.330667
2019-01-08  35.956989  82.829002  54.010281  96.700127  22.356667
Training days: 1008, Testing days: 250


In [123]:
# Mapping functions between allocation percentages and letter codes
def encode_state(increments):
    """
    Convert a tuple of 5 integer increments (each 0-20) to a 5-letter state string.
    Each increment represents 5% allocation units.
    """
    return "".join(chr(ord('A') + inc) for inc in increments)

def decode_state(state_str):
    """Convert a 5-letter state string back to a tuple of 5 increment values (0-20 each)."""
    return tuple(ord(ch) - ord('A') for ch in state_str)

# Define all possible actions (including no-action)
actions = [(i, j) for i in range(num_assets) for j in range(num_assets) if i != j]
actions.append((None, None))  # no rebalance action

def get_valid_actions(state):
    """
    Given a state (5-letter string or tuple of increments), return a list of valid actions.
    An action (i,j) is valid if asset i has at least 5% to give (increment >=1) 
    and asset j has at most 95% (increment <=19) to receive.
    The no-action (None,None) is always valid.
    """
    # If state is given as a string, decode to increments
    increments = decode_state(state) if isinstance(state, str) else state
    valid = []
    for (i, j) in actions:
        if i is None and j is None:
            # No-action is always allowed
            valid.append((None, None))
            continue
        if i is None or j is None:
            # Ignore half-specified actions (only (None,None) is used for no-action)
            continue
        if increments[i] >= 1 and increments[j] <= 19:
            valid.append((i, j))
    return valid

# Example: initial equal-weight state and its valid actions count
initial_state = (4, 4, 4, 4, 4)  # 4*5% = 20% each, corresponds to 'EEEEE'
print("Initial state:", encode_state(initial_state))
print("Number of valid actions from initial state:", len(get_valid_actions(initial_state)))

Initial state: EEEEE
Number of valid actions from initial state: 21


In [124]:
def compute_reward(weight_frac, price_today, price_next):
    """
    Compute the log return of the portfolio given weight fractions and asset prices.
    weight_frac: list of 5 weight fractions (sums to 1) at day t after rebalancing.
    price_today: prices of the 5 assets at day t.
    price_next: prices of the 5 assets at day t+1.
    Returns: log(portfolio_return) from t to t+1.
    """
    # Portfolio value growth factor = sum_k w_k * (price_next_k / price_today_k)
    growth_factor = 0.0
    for k in range(num_assets):
        growth_factor += weight_frac[k] * (price_next[k] / price_today[k])
    # Reward is log of the growth factor
    return math.log(growth_factor)

def apply_action(state, action):
    """
    Apply a rebalancing action to a state (5% transfer from one asset to another).
    state: current state as a tuple of increments (summing to 20).
    action: tuple (i,j) meaning transfer 5% from asset i to asset j, or (None,None) for no action.
    Returns the new state (tuple of increments) after the action.
    """
    increments = list(state)
    if action is None or action == (None, None):
        # No change in allocation
        return tuple(increments)
    i, j = action
    # Reduce 5% (one increment) from asset i and add 5% to asset j
    if increments[i] >= 1 and increments[j] <= 19:
        increments[i] -= 1
        increments[j] += 1
    return tuple(increments)

In [125]:
# Initialize Q-table as an empty dictionary
Q = {}

def select_action(state, Q, epsilon):
    """
    Select an action for the given state using epsilon-greedy policy.
    """
    valid_actions = get_valid_actions(state)
    # Exploration: choose a random valid action with probability epsilon
    if random.random() < epsilon:
        return random.choice(valid_actions)
    # Exploitation: choose the action with highest Q-value (ties broken arbitrarily)
    state_key = encode_state(state)
    # Initialize Q entries for this state if not seen before
    if state_key not in Q:
        Q[state_key] = {a: 0.0 for a in valid_actions}
    # Select the action with max Q-value
    best_action = max(Q[state_key], key=Q[state_key].get)
    return best_action

def update_Q(Q, state, action, reward, next_state, alpha, gamma):
    """
    Update the Q-table for state-action pair (state, action) using the Q-learning update rule.
    """
    state_key = encode_state(state)
    next_state_key = encode_state(next_state)
    
    # Initialize Q entries for current state if not already present
    if state_key not in Q:
        Q[state_key] = {a: 0.0 for a in get_valid_actions(state)}
    
    # Initialize Q entries for next_state if not already
    if next_state_key not in Q:
        Q[next_state_key] = {a: 0.0 for a in get_valid_actions(next_state)}
    # Current Q-value for this state-action
    q_current = Q[state_key].get(action, 0.0)
    # Best possible Q-value for next state (future reward estimate)
    q_next_max = max(Q[next_state_key].values()) if Q[next_state_key] else 0.0

    # Q-learning update
    Q[state_key][action] = q_current + alpha * (reward + gamma * q_next_max - q_current)

In [126]:
# Hyperparameters for Q-learning
alpha = 0.1        # learning rate
gamma = 0.99       # discount factor (close to 1 for long-term rewards)
epsilon = 1.0      # initial exploration rate
epsilon_min = 0.1  # minimum exploration rate
decay_rate = 0.99  # multiplicative decay for epsilon per episode

episodes = 100  # number of training episodes (iterations over the 4-year training period)

# Initial state is equal-weight (20% each) for each episode
initial_state = (4, 4, 4, 4, 4)  # corresponds to 'EEEEE'

train_days = train_prices.shape[0]

for ep in range(episodes):
    state = initial_state
    # Iterate over each day in the training period (except the last day, since we look ahead one day for reward)
    for t in range(train_days - 1):
        # Choose an action (ε-greedy policy)
        action = select_action(state, Q, epsilon)
        # Apply the action to get new portfolio distribution
        new_state = apply_action(state, action)
        # Calculate reward from day t to t+1
        weights_new = [inc/20.0 for inc in new_state]  # convert increments to fractions
        reward = compute_reward(weights_new, train_prices[t], train_prices[t+1])
        # Update Q-table based on the action and received reward
        update_Q(Q, state, action, reward, new_state, alpha, gamma)
        # Move to next state
        state = new_state
    # Decay exploration rate after each episode (to gradually reduce random exploration)
    epsilon = max(epsilon * decay_rate, epsilon_min)

print("Training completed.")

Training completed.


In [127]:
# Evaluation on the test period (last year)
test_days = test_prices.shape[0]

# Initialize portfolio values and states for agent and baseline
agent_value = 1.0
baseline_value = 1.0
state = initial_state  # agent's state (start equal-weight)
# Baseline: determine initial shares for each asset with equal weights
baseline_weights = [0.2] * num_assets  # 20% in each asset
# If initial portfolio value = 1.0, money allocated to each asset = 0.2
# Number of shares of each asset the baseline holds initially:
baseline_shares = [baseline_weights[i] * baseline_value / test_prices[0][i] for i in range(num_assets)]

# Simulate day by day
for t in range(test_days - 1):
    # Agent chooses the best action (no exploration in test)
    state_key = encode_state(state)
    if state_key in Q:
        # Choose action with highest Q-value for current state
        action = max(Q[state_key], key=Q[state_key].get)
    else:
        action = (None, None)  # if state not seen in training, do nothing
    # Rebalance according to the chosen action
    state = apply_action(state, action)
    # Compute portfolio growth factor for agent from day t to t+1
    weights = [inc/20.0 for inc in state]
    growth_factor = 0.0
    for k in range(num_assets):
        growth_factor += weights[k] * (test_prices[t+1][k] / test_prices[t][k])
    agent_value *= growth_factor
    # Update baseline portfolio value from day t to t+1 (using held shares)
    baseline_value = 0.0
    for k in range(num_assets):
        baseline_value += baseline_shares[k] * test_prices[t+1][k]

# Calculate total returns over the test period
agent_return_pct = (agent_value - 1.0) * 100
baseline_return_pct = (baseline_value - 1.0) * 100

print(f"Baseline final portfolio value: {baseline_value:.4f}  (Return: {baseline_return_pct:.2f}%)")
print(f"Agent final portfolio value:    {agent_value:.4f}  (Return: {agent_return_pct:.2f}%)")

Baseline final portfolio value: 1.7536  (Return: 75.36%)
Agent final portfolio value:    1.7740  (Return: 77.40%)
