**Coursework for 《Advances in Machine Learning》, 2025 Spring.**
***
This notebook demonstrates the implementation of [Trading financial indices with reinforcement learning agents](https://doi.org/10.1016/j.eswa.2018.02.032).

Our github repository is [here](https://github.com/FlyingParachute/MathFin_RL).

**Group members:** Jingtong Xu, Jinyi Lin, Sunqinli Wang, Xingjian Zhao.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import os

# 0 Parameter setting (same as the paper)

In [None]:
# Learning hyper-parameters shared by all agents defined in this notebook.
gamma = 0.9      # discount factor applied to the bootstrapped next value
lambda_ = 0.9    # eligibility-trace decay (traces are scaled by gamma * lambda_)
alpha = 0.1      # learning rate of the Q / theta updates
epsilon = 0.01   # probability of taking a random (exploratory) action
eta = 0.1        # adaptation rate of the moving moments A, B of the Sharpe reward

# Return-column suffix per rebalancing frequency, and bond-column prefix per portfolio.
_FREQ_SUFFIX = {'quarterly': '1q', 'semi_annual': 's', 'annual': 'a'}
_BOND_PREFIX = {'ptf1': 'AGG', 'ptf3': 'TNX'}

# column_names[portfolio][freq] -> {'spx': <stock return column>, 'agg': <bond return column>}
column_names = {
    ptf: {
        freq: {'spx': f'SPXret_{suffix}', 'agg': f'{bond}ret_{suffix}'}
        for freq, suffix in _FREQ_SUFFIX.items()
    }
    for ptf, bond in _BOND_PREFIX.items()
}

# 1 Load and Prepare Data

## 1.1 Load Data

**Portfolio 1**: the combination of SPX and AGG;

**Portfolio 2**: the combination of SPX and T-NOTE 10YR (stored in `Portfolio_3.xlsx` and referred to as `ptf3` in the code below).

In [None]:
def load_ptf1_annual():
    """
    Load the annual SPX and AGG data (Portfolio 1).

    Only the 'Yearly' sheet of ./data/processed/Portfolio_1.xlsx is parsed;
    the 'Quarterly' and 'Semi Annually' sheets are not needed for the annual
    table and are therefore no longer read.

    Returns
    -------
    pandas.DataFrame
        The 'Yearly' sheet with its 'Dates' column set as the index.
    """
    file_path = './data/processed/Portfolio_1.xlsx'
    return pd.read_excel(file_path, sheet_name='Yearly').set_index('Dates')

def load_ptf3_annual():
    """
    Load the annual SPX and TNOTE data (Portfolio 3 workbook).

    Only the 'Yearly' sheet of ./data/processed/Portfolio_3.xlsx is parsed;
    the 'Quarterly' and 'Semi Annually' sheets are not needed for the annual
    table and are therefore no longer read.

    Returns
    -------
    pandas.DataFrame
        The 'Yearly' sheet with its 'Dates' column set as the index.
    """
    file_path = './data/processed/Portfolio_3.xlsx'
    return pd.read_excel(file_path, sheet_name='Yearly').set_index('Dates')

def ensure_dir_exists(directory):
    """
    Create `directory` (including missing parent directories) if it is absent.

    Uses ``exist_ok=True`` instead of a separate existence check, so the call
    is idempotent and cannot fail when the directory appears between the check
    and the creation.

    Raises
    ------
    FileExistsError
        If `directory` exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)

## 1.2 Define the State Space

In [None]:
def get_state(spx_ret, agg_ret):
    """
    Encode the signs of two returns as a two-character state label.

    The first character describes `spx_ret`, the second `agg_ret`:
    '1' for a return >= 0 and '0' otherwise (e.g. '10' = stocks up, bonds down).
    """
    return ''.join('1' if ret >= 0 else '0' for ret in (spx_ret, agg_ret))

# 2 Action Space

| Action | 1      | 2      | 3      | 4      | 5      |
|--------|--------|--------|--------|--------|--------|
| **S&P 500 (%)** | 0      | 25     | 50     | 75     | 100    |
| **AGG or T-Note bond (%)** | 100    | 75     | 50     | 25     | 0      |

In [None]:

# Admissible stock weights of the discrete agents (the remainder goes to bonds)
actions = [0.0, 0.25, 0.5, 0.75, 1.0]

def initialize_q():
    """
    Build a randomly initialised Q table.

    Returns a DataFrame with one row per state label ('11', '01', '10', '00'),
    one column per entry of `actions`, and values drawn from np.random.rand.
    """
    state_labels = ['11', '01', '10', '00']
    random_values = np.random.rand(4, 5)
    return pd.DataFrame(data=random_values, index=state_labels, columns=actions)

# 3 Train the Agent

## 3.1 Define the Agent

In [None]:
class SarsaLambdaAgent:
    """
    SARSA(λ) discrete action agent.

    Holds a Q table (from `initialize_q`) and an eligibility-trace table with
    the same row (state) and column (action) labels.

    reward_type='return' uses the raw portfolio return as reward; any other
    value (intended: 'sharpe') uses the differential Sharpe ratio.
    """
    def __init__(self, reward_type='return'):
        self.q = initialize_q()
        # One eligibility trace per (state, action) pair, all starting at zero
        self.e = pd.DataFrame(0.0, index=self.q.index, columns=self.q.columns)
        self.reward_type = reward_type
        self.A = 0  # First moment for differential Sharpe ratio
        self.B = 0  # Second moment for differential Sharpe ratio

    def get_reward(self, spx_ret, agg_ret, action):
        """
        Return the reward for holding weight `action` in stocks over one period.

        For reward_type == 'return' this is the portfolio return
        action * spx_ret + (1 - action) * agg_ret. Otherwise it is the
        differential Sharpe ratio computed from the moving moments A and B
        *before* this observation, after which A and B are updated with rate
        `eta`. The ratio is defined as 0 while the variance estimate
        B - A**2 is not strictly positive.
        """
        portfolio_ret = action * spx_ret + (1 - action) * agg_ret

        if self.reward_type == 'return':
            return portfolio_ret

        # Differential Sharpe ratio based on the previous A and B
        old_A = self.A
        old_B = self.B
        variance = old_B - old_A**2
        # B - A**2 cannot be negative mathematically, but rounding can make it
        # slightly negative; a negative base to the power 1.5 would yield a
        # complex number (Python floats) or NaN (numpy floats).
        if variance <= 0:
            denominator = 0
        else:
            denominator = variance**1.5

        if denominator == 0:
            dsr = 0
        else:
            dsr = (old_B * (portfolio_ret - old_A)
                   - 0.5 * old_A * (portfolio_ret**2 - old_B)) / denominator

        # Update A and B with the new observation
        self.A = old_A + eta * (portfolio_ret - old_A)
        self.B = old_B + eta * (portfolio_ret**2 - old_B)
        return dsr

    def update(self, state, action, reward, next_state, next_action):
        """
        Apply one SARSA(λ) step for the transition
        (state, action) -> reward -> (next_state, next_action).
        """
        # Calculate TD error
        delta = reward + gamma * self.q.loc[next_state, next_action] - self.q.loc[state, action]

        # Replacing trace - set current state-action pair to 1
        self.e.loc[state, action] = 1

        # Update every Q value in proportion to its trace (vectorised form of
        # the element-wise q[s, a] += alpha * delta * e[s, a])
        self.q += alpha * delta * self.e

        # Decay all eligibility traces
        self.e = gamma * lambda_ * self.e

    def choose_action(self, state):
        """
        Epsilon-greedy choice: with probability `epsilon` a random entry of
        `actions`, otherwise the column label with the largest Q value in
        row `state`.
        """
        if np.random.rand() < epsilon:
            return np.random.choice(actions)
        return self.q.loc[state].idxmax()

class QLambdaAgent(SarsaLambdaAgent):
    """
    Q(λ) discrete action agent (Watkins-style trace cutting).

    Inherits the Q table, traces, reward computation and epsilon-greedy
    action selection from SarsaLambdaAgent; only the update rule differs.
    reward_type='return' or 'sharpe'.
    """

    def update(self, state, action, reward, next_state, next_action=None):
        """
        Apply one Q(λ) step for the transition (state, action) -> reward -> next_state.

        Parameters
        ----------
        next_action : optional
            The action the behaviour policy takes in `next_state`. When omitted
            (as in the original four-argument call), an epsilon-greedy action is
            sampled here. Accepting it makes the signature call-compatible with
            SarsaLambdaAgent.update and lets a caller pass the action it will
            really execute.
        """
        # Greedy action of the next state under the current (pre-update) Q table
        a_star = self.q.loc[next_state].idxmax()

        # Sample the behaviour action before Q changes, so that it is compared
        # with a greedy action derived from the same Q table.
        if next_action is None:
            next_action = self.choose_action(next_state)

        # Calculate TD error (off-policy: bootstrap on the greedy action)
        delta = reward + gamma * self.q.loc[next_state, a_star] - self.q.loc[state, action]

        # Set eligibility trace for the current state-action pair to 1
        self.e.loc[state, action] = 1

        # Update every Q value in proportion to its trace (vectorised form of
        # the element-wise q[s, a] += alpha * delta * e[s, a])
        self.q += alpha * delta * self.e

        # If the next action is not greedy, zero all eligibility traces;
        # otherwise decay them
        if next_action != a_star:
            self.e = 0 * self.e
        else:
            self.e = gamma * lambda_ * self.e

class TDContinuousAgent:
    """
    TD(λ) continuous action agent (two assets).

    Every state owns a parameter pair theta = [theta1, theta2] and a matching
    eligibility-trace pair. theta1 is kept in [0, 1] and is used as the stock
    allocation.
    """
    def __init__(self):
        # Each state has theta=[theta1, theta2], theta1 in [0,1] represents stock proportion
        states = ['11', '01', '10', '00']
        self.theta = {s: [np.random.uniform(0, 1), 0] for s in states}
        self.e_trace = {s: [0, 0] for s in states}

    def get_value(self, state, spx_ret, agg_ret):
        """Calculate the value function for the current state."""
        # V(s) = θ₁(R_t^S - R_t^B) + θ₂
        theta1, theta2 = self.theta[state]
        return theta1 * (spx_ret - agg_ret) + theta2

    def get_allocation(self, state):
        """Return stock allocation proportion based on current state."""
        if np.random.rand() < epsilon:
            # Exploration: return a random value between [0,1]
            return np.random.uniform(0, 1)
        # Exploitation: return the θ₁ value for the current state
        return np.clip(self.theta[state][0], 0, 1)

    def update(self, state, spx_ret, agg_ret, reward, next_state):
        """
        Apply one TD(λ) step.

        Both the current and the next state value are evaluated with the same
        (spx_ret, agg_ret) pair. All eligibility traces decay by gamma * lambda_,
        the current state's trace receives the value gradient, and every state's
        parameters move along its own trace, so states visited earlier in the
        episode also receive credit for the TD error.
        """
        # Calculate value functions for current and next states
        current_value = self.get_value(state, spx_ret, agg_ret)
        next_value = self.get_value(next_state, spx_ret, agg_ret)

        # Calculate TD error
        delta = reward + gamma * next_value - current_value

        # Update eligibility traces: e = γλe + ∇θV(s).
        # The gradient is non-zero only for the current state's parameters,
        # but the decay applies to the traces of all states.
        decay = gamma * lambda_
        for trace in self.e_trace.values():
            for i in range(2):
                trace[i] = decay * trace[i]
        gradient = (spx_ret - agg_ret, 1)  # ∇θV(s) = (R_t^S - R_t^B, 1)^T
        for i in range(2):
            self.e_trace[state][i] += gradient[i]

        # Update parameters of every state: θ = θ + αδe
        for s, trace in self.e_trace.items():
            params = self.theta[s]
            for i in range(2):
                params[i] += alpha * delta * trace[i]
            # Constrain θ₁ to the [0,1] interval
            params[0] = np.clip(params[0], 0, 1)


## 3.2 Backtest Function

In [None]:
# Settings shared by both back-testing schemes
_INITIAL_CAPITAL = 10000   # starting value of every simulated portfolio
_NUM_EPISODES = 50         # training episodes per (re)trained agent
_MIN_EPISODE_LEN = 4       # rows left after the latest admissible random episode start


def _as_timestamp(date):
    """Parse `date` with pandas if it is a string; return anything else unchanged."""
    return pd.to_datetime(date) if isinstance(date, str) else date


def _make_agent(agent_type, reward_type):
    """Create a fresh agent for `agent_type`; raise ValueError for an unknown type."""
    if agent_type == 'continuous':
        return TDContinuousAgent()
    if agent_type == 'sarsa':
        return SarsaLambdaAgent(reward_type=reward_type)
    if agent_type == 'qlearning':
        return QLambdaAgent(reward_type=reward_type)
    raise ValueError(
        f"Unknown agent_type {agent_type!r}; expected 'sarsa', 'qlearning' or 'continuous'"
    )


def _select_action(agent, agent_type, state):
    """Ask the agent for its stock weight in `state` (continuous or discrete API)."""
    if agent_type == 'continuous':
        return agent.get_allocation(state)
    return agent.choose_action(state)


def _train_agent(agent, train_data, spx_col, agg_col, agent_type, reward_type):
    """
    Train `agent` in place for _NUM_EPISODES episodes on `train_data`.

    Each episode starts at a random row, resets the eligibility traces and then
    walks to the end of `train_data`: the state comes from the signs of the
    previous row's returns, the reward from the current row's returns.
    """
    n_rows = len(train_data)
    for _ in range(_NUM_EPISODES):
        # Randomly select starting point
        start_idx = np.random.randint(0, max(1, n_rows - _MIN_EPISODE_LEN))

        # Reset eligibility traces
        if agent_type == 'continuous':
            agent.e_trace = {s: [0, 0] for s in agent.e_trace}
        else:
            agent.e = pd.DataFrame(np.zeros((4, 5)), index=agent.q.index, columns=agent.q.columns)

        # SARSA is on-policy: the action it bootstraps on must be the action
        # executed at the next step, so it is carried over instead of re-sampled.
        carried_action = None

        for i in range(start_idx + 1, n_rows):
            prev_row = train_data.iloc[i - 1]
            curr_row = train_data.iloc[i]
            state = get_state(prev_row[spx_col], prev_row[agg_col])

            if carried_action is not None:
                action = carried_action
            else:
                action = _select_action(agent, agent_type, state)

            # Calculate reward
            if agent_type in ['sarsa', 'qlearning'] and reward_type == 'sharpe':
                reward = agent.get_reward(curr_row[spx_col], curr_row[agg_col], action)
            else:
                reward = action * curr_row[spx_col] + (1 - action) * curr_row[agg_col]

            # The next iteration's state is computed from the same row
            next_state = get_state(curr_row[spx_col], curr_row[agg_col])

            if agent_type == 'sarsa':
                next_action = agent.choose_action(next_state)
                agent.update(state, action, reward, next_state, next_action)
                carried_action = next_action
            elif agent_type == 'qlearning':
                agent.update(state, action, reward, next_state)
            elif agent_type == 'continuous':
                agent.update(state, curr_row[spx_col], curr_row[agg_col], reward, next_state)


def backtest(data, train_end_date, test_end_date,
             agent_type='sarsa', reward_type='return',
             freq='annual', portfolio='ptf1'):
    """
    Static Knowledge Agents (SKAs).

    Trains one agent on the rows with index <= train_end_date and then lets it
    allocate, without further learning, over the rows in
    (train_end_date, test_end_date].

    Parameters
    ----------
    data : DataFrame indexed by date that contains the return columns selected
        by column_names[portfolio][freq].
    train_end_date, test_end_date : date-like; strings are parsed with pandas.
    agent_type : 'sarsa', 'qlearning' or 'continuous'.
    reward_type : 'return' or 'sharpe' (only used by the discrete agents).

    Returns
    -------
    (portfolio_values, dates) : two equally long lists. The first entries are
        the initial capital (10000) and train_end_date; one entry per test row
        follows.

    Raises
    ------
    ValueError : if agent_type is not one of the supported names.
    IndexError : if there are test rows but no training row to derive the
        first state from.
    """
    spx_col = column_names[portfolio][freq]['spx']
    agg_col = column_names[portfolio][freq]['agg']

    train_end_date = _as_timestamp(train_end_date)
    test_end_date = _as_timestamp(test_end_date)

    train_data = data[data.index <= train_end_date]
    test_data = data[(data.index > train_end_date) & (data.index <= test_end_date)]

    # Training phase: multiple episodes with randomized starting points
    agent = _make_agent(agent_type, reward_type)
    _train_agent(agent, train_data, spx_col, agg_col, agent_type, reward_type)

    # Testing phase (the agent keeps its epsilon-greedy action selection)
    portfolio_values = [_INITIAL_CAPITAL]
    dates = [train_end_date]

    for i in range(len(test_data)):
        # The first test state is derived from the last training row
        prev_row = train_data.iloc[-1] if i == 0 else test_data.iloc[i - 1]
        curr_row = test_data.iloc[i]

        state = get_state(prev_row[spx_col], prev_row[agg_col])
        action = _select_action(agent, agent_type, state)

        ret = action * curr_row[spx_col] + (1 - action) * curr_row[agg_col]
        portfolio_values.append(portfolio_values[-1] * (1 + ret))
        dates.append(test_data.index[i])

    return portfolio_values, dates

def backtest_AKA(data, train_end_date, test_end_date,
                 agent_type='sarsa', reward_type='return',
                 freq='annual', portfolio='ptf1'):
    """
    Adaptive Knowledge Agents (AKAs).

    For every date in (train_end_date, test_end_date] a new agent is created,
    trained on all rows strictly before that date (only if there are more than
    4 such rows) and used for that single allocation decision.

    Parameters and return value are the same as for `backtest`; string dates
    are parsed with pandas here as well, so the first entry of the returned
    dates has the same type in both functions.

    Raises
    ------
    ValueError : if agent_type is not one of the supported names.
    """
    spx_col = column_names[portfolio][freq]['spx']
    agg_col = column_names[portfolio][freq]['agg']

    train_end_date = _as_timestamp(train_end_date)
    test_end_date = _as_timestamp(test_end_date)

    full_data = pd.concat([
        data[data.index <= train_end_date],
        data[(data.index > train_end_date) & (data.index <= test_end_date)]
    ])

    portfolio_values = [_INITIAL_CAPITAL]
    dates = [train_end_date]

    for current_date in full_data[full_data.index > train_end_date].index:
        current_train_data = full_data[full_data.index < current_date]

        # Reinitialize and retrain the agent for every test date
        agent = _make_agent(agent_type, reward_type)
        if len(current_train_data) > _MIN_EPISODE_LEN:
            _train_agent(agent, current_train_data, spx_col, agg_col, agent_type, reward_type)

        # Test current point.
        # NOTE(review): with no earlier row the fallback below takes the state
        # from the first row of full_data - confirm that this is intended.
        prev_row = current_train_data.iloc[-1] if len(current_train_data) > 0 else full_data.iloc[0]
        current_row = full_data.loc[current_date]
        state = get_state(prev_row[spx_col], prev_row[agg_col])
        action = _select_action(agent, agent_type, state)

        ret = action * current_row[spx_col] + (1 - action) * current_row[agg_col]
        portfolio_values.append(portfolio_values[-1] * (1 + ret))
        dates.append(current_date)

    return portfolio_values, dates


## 3.3 Functions for computing benchmarks and visualizing the backtest results

In [None]:
def calculate_benchmarks(data, train_end_date, test_end_date,
                         freq='annual', portfolio='ptf1'):
    """
    Simulate the fixed-allocation benchmark portfolios over the test window.

    Uses the rows of `data` whose index lies in (train_end_date, test_end_date].
    Every benchmark starts at 10000 and is compounded once per row.

    Returns a dict of value paths (lists) with the keys
    'A2', 'A3', 'A4' (25 %, 50 %, 75 % stocks), 'Bonds' (0 % stocks),
    'Stocks' (100 % stocks) and 'Ceiling' (the better of the two asset
    returns in every period).
    """
    cols = column_names[portfolio][freq]
    spx_col, agg_col = cols['spx'], cols['agg']

    in_test_window = (data.index > train_end_date) & (data.index <= test_end_date)
    test_data = data[in_test_window]

    # Stock weight of each constant-mix strategy
    stock_weights = {'A2': 0.25, 'A3': 0.5, 'A4': 0.75, 'Bonds': 0.0, 'Stocks': 1.0}

    benchmarks = {name: [10000] for name in stock_weights}
    # Ceiling strategy
    benchmarks['Ceiling'] = [10000]

    for pos in range(len(test_data)):
        row = test_data.iloc[pos]
        spx_ret, agg_ret = row[spx_col], row[agg_col]

        for name, weight in stock_weights.items():
            path = benchmarks[name]
            period_ret = weight * spx_ret + (1 - weight) * agg_ret
            path.append(path[-1] * (1 + period_ret))

        ceiling_path = benchmarks['Ceiling']
        ceiling_path.append(ceiling_path[-1] * (1 + max(spx_ret, agg_ret)))

    return benchmarks

def annualized_returns(final_val, years, initial_val=10000):
    """
    Compute the total and the annualized return of an investment, in percent.

    final_val:   Final portfolio value
    years:       Number of investment years
    initial_val: Starting portfolio value (defaults to the 10000 used by the
                 backtests in this notebook)

    Returns (total_ret, ann_ret), both expressed in percent.
    Raises ZeroDivisionError if `years` or `initial_val` is 0.
    """
    total_ret = (final_val / initial_val - 1) * 100
    # Geometric average: the constant yearly growth rate that compounds to total_ret
    ann_ret = ((1 + total_ret/100)**(1/years) - 1) * 100
    return total_ret, ann_ret

# ==========================================
#  Shared implementation of Fig.4 - Fig.11
# ==========================================
def _plot_performance_figures(data, train_start_str, train_end_str, test_end_str,
                              freq, portfolio, fig_title_prefix,
                              on_policy_bond_label, off_policy_bond_label,
                              title_suffix=''):
    """
    Run all agents on one training/testing split and draw two figures.

    Figure 1: SARSA agents and continuous agents, plus the bond, stock and
              ceiling benchmarks.
    Figure 2: Q-learning agents, plus the bond, stock and A2-A4 benchmarks.

    Training period: [train_start, train_end]
    Testing period:  (train_end, test_end]

    The two bond labels are the legend entries of the all-bond benchmark in
    the first and second figure; `title_suffix` is appended to both titles.
    Returns the two matplotlib figures.
    """
    train_start = pd.to_datetime(train_start_str)
    train_end = pd.to_datetime(train_end_str)
    test_end = pd.to_datetime(test_end_str)

    # Restrict the data to this period; the backtests split it again at train_end
    full_data = data[(data.index >= train_start) & (data.index <= test_end)]

    benchmarks = calculate_benchmarks(full_data, train_end, test_end, freq, portfolio)

    def run_agents(specs):
        """Run each (label, runner, agent_type, reward_type) spec in the given order."""
        curves = []
        for label, runner, agent_type, reward_type in specs:
            values, dates = runner(full_data, train_end, test_end,
                                   agent_type=agent_type, reward_type=reward_type,
                                   freq=freq, portfolio=portfolio)
            curves.append((label, dates, values))
        return curves

    def finish_axes(ax, title):
        """Apply the formatting shared by both figures."""
        ax.grid(True, alpha=0.3)
        ax.legend(loc='upper left')
        ax.set_title(title)
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
        ax.figure.autofmt_xdate()
        ax.set_ylabel('Portfolio Value ($)')
        ax.set_xlabel('Year')
        ax.figure.tight_layout()

    # ========== First figure: on-policy & continuous ==========
    on_policy_curves = run_agents([
        ('SKA (R-SKA)', backtest,     'sarsa',      'return'),
        ('AKA (R-AKA)', backtest_AKA, 'sarsa',      'return'),
        ('S-SKA',       backtest,     'sarsa',      'sharpe'),
        ('S-AKA',       backtest_AKA, 'sarsa',      'sharpe'),
        ('CA-SKA',      backtest,     'continuous', 'return'),
        ('CA-AKA',      backtest_AKA, 'continuous', 'return'),
    ])

    fig_on, ax_on = plt.subplots(figsize=(10, 6))
    # Every curve is drawn against the dates returned by its own backtest
    for label, dates, values in on_policy_curves:
        ax_on.plot(dates, values, label=label, linewidth=2)
    benchmark_dates = on_policy_curves[0][1]
    ax_on.plot(benchmark_dates, benchmarks['Bonds'],   label=on_policy_bond_label, linestyle='--')
    ax_on.plot(benchmark_dates, benchmarks['Stocks'],  label='S&P 500', linestyle='--')
    ax_on.plot(benchmark_dates, benchmarks['Ceiling'], label='Ceiling', linestyle='-.')
    finish_axes(
        ax_on,
        f'{fig_title_prefix}Portfolio Performance: On-policy & Continuous Agents{title_suffix}')

    # ========== Second figure: off-policy ==========
    off_policy_curves = run_agents([
        ('Q-SKA',  backtest,     'qlearning', 'return'),
        ('Q-AKA',  backtest_AKA, 'qlearning', 'return'),
        ('QS-SKA', backtest,     'qlearning', 'sharpe'),
        ('QS-AKA', backtest_AKA, 'qlearning', 'sharpe'),
    ])

    fig_off, ax_off = plt.subplots(figsize=(10, 6))
    benchmark_dates = off_policy_curves[0][1]
    ax_off.plot(benchmark_dates, benchmarks['Bonds'],  label=off_policy_bond_label, linestyle='--')
    ax_off.plot(benchmark_dates, benchmarks['Stocks'], label='Stocks', linestyle='--')
    for label, dates, values in off_policy_curves:
        ax_off.plot(dates, values, label=label, linewidth=2)
    for name in ('A2', 'A3', 'A4'):
        ax_off.plot(benchmark_dates, benchmarks[name], label=name, linestyle=':')
    finish_axes(
        ax_off,
        f'{fig_title_prefix}Portfolio Performance: Off-policy Agents{title_suffix}')

    return fig_on, fig_off


# ================
#  Plot Fig.4 & Fig.5
# ================
def plot_fig4_5(data, train_start_str, train_end_str, test_start_str, test_end_str,
                freq='annual', portfolio='ptf1', fig_title_prefix=''):
    """
    Fig.4: On-policy(SARSA) & Continuous
    Fig.5: Off-policy(Q-learning)
    Training period: [train_start, train_end]
    Testing period: (train_end, test_end]

    `test_start_str` is accepted for interface compatibility but not used:
    the test window starts right after train_end.
    """
    return _plot_performance_figures(
        data, train_start_str, train_end_str, test_end_str,
        freq, portfolio, fig_title_prefix,
        on_policy_bond_label='AGG Bonds', off_policy_bond_label='Bonds')

def plot_fig6_7(data, train_start_str, train_end_str, test_start_str, test_end_str,
                freq='annual', portfolio='ptf1', fig_title_prefix=''):
    """
    Fig.6: On-policy & Continuous (second training/testing period)
    Fig.7: Off-policy (second training/testing period)

    Same as plot_fig4_5 with ' (Second Period)' appended to both titles.
    """
    return _plot_performance_figures(
        data, train_start_str, train_end_str, test_end_str,
        freq, portfolio, fig_title_prefix,
        on_policy_bond_label='AGG Bonds', off_policy_bond_label='Bonds',
        title_suffix=' (Second Period)')


# ================
#  Plot Fig.8 & Fig.9
# ================
def plot_fig8_9(data, train_start_str, train_end_str, test_start_str, test_end_str,
                freq='annual', portfolio='ptf3', fig_title_prefix=''):
    """
    Fig.8: On-policy(SARSA) & Continuous
    Fig.9: Off-policy(Q-learning)
    Training period: [train_start, train_end]
    Testing period: (train_end, test_end]

    `test_start_str` is accepted for interface compatibility but not used:
    the test window starts right after train_end.
    """
    return _plot_performance_figures(
        data, train_start_str, train_end_str, test_end_str,
        freq, portfolio, fig_title_prefix,
        on_policy_bond_label='T-NOTE', off_policy_bond_label='T-NOTE')

def plot_fig10_11(data, train_start_str, train_end_str, test_start_str, test_end_str,
                freq='annual', portfolio='ptf3', fig_title_prefix=''):
    """
    Fig.10: On-policy & Continuous (second training/testing period)
    Fig.11: Off-policy (second training/testing period)

    Same as plot_fig8_9 with ' (Second Period)' appended to both titles.
    """
    return _plot_performance_figures(
        data, train_start_str, train_end_str, test_end_str,
        freq, portfolio, fig_title_prefix,
        on_policy_bond_label='T-NOTE', off_policy_bond_label='T-NOTE',
        title_suffix=' (Second Period)')


# 4 Results