In [1]:
import pandas as pd
import ast
from tqdm import tqdm  # For progress bar
import numpy as np
from sklearn.preprocessing import StandardScaler


def process_large_dataset(df, list_columns, chunk_size=1000):
    """
    Expand string-encoded list columns into scalar columns, then standardize.

    Every column named in ``list_columns`` is parsed with ``ast.literal_eval``;
    element ``k`` (0-based) of column ``col`` is written to a new column named
    ``f"{col}_{k+1}"``. Once all list columns have been expanded, the whole
    frame (remaining original columns plus the new ones) is passed through
    ``StandardScaler`` exactly once.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Columns not listed in ``list_columns`` are handed to the
        scaler unchanged, so they need to be convertible to float.
    list_columns : list of str
        Names of the columns holding string-encoded lists. The number of
        elements of each column is taken from its first row. May be empty, in
        which case the frame is only standardized.
    chunk_size : int, default 1000
        Number of rows covered by one step of the inner progress bar. It only
        changes progress reporting, not the result.

    Returns
    -------
    pandas.DataFrame
        Standardized frame with the same index as ``df``; the list columns are
        replaced by their expanded ``{col}_{k}`` columns.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    ValueError, SyntaxError
        If the first row of a list column cannot be parsed (that row defines
        the expected list length, so there is no sensible fallback).
    IndexError
        If ``df`` has no rows while ``list_columns`` is not empty.

    Notes
    -----
    Any later row that fails to parse, or whose length does not match the
    first row, is reported with ``print`` and filled with NaN instead of
    aborting the whole run.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    list_columns = list(list_columns)
    n_rows = len(df)

    # Start from the non-list columns; expanded columns are appended below.
    expanded_df = df.drop(columns=list_columns).copy()

    for col in tqdm(list_columns, desc="Processing columns"):
        # The first row fixes how many scalar columns this list column yields.
        first_row_list = ast.literal_eval(df[col].iloc[0])
        num_elements = len(first_row_list)

        # Pre-allocate one row per input row; filled in place below.
        expanded_values = np.zeros((n_rows, num_elements))

        # Walk the column in chunks so the inner progress bar advances in
        # coarse steps rather than once per row.
        for start_idx in tqdm(range(0, n_rows, chunk_size), desc=f"Processing {col}"):
            chunk = df[col].iloc[start_idx:start_idx + chunk_size]

            for offset, row in enumerate(chunk):
                row_idx = start_idx + offset
                try:
                    # A length mismatch surfaces as ValueError on assignment
                    # and is handled like a parse failure.
                    expanded_values[row_idx] = ast.literal_eval(row)
                except (ValueError, SyntaxError) as e:
                    print(f"Error processing row {row_idx} in column {col}: {e}")
                    expanded_values[row_idx] = np.nan

        # One output column per list element, numbered from 1.
        for i in range(num_elements):
            expanded_df[f'{col}_{i+1}'] = expanded_values[:, i]

    # Standardize once, after every list column has been expanded. Doing this
    # inside the loop re-fitted the scaler per column and left the result
    # undefined when there were no list columns at all.
    scaler = StandardScaler()
    normalized_data = scaler.fit_transform(expanded_df)

    # fit_transform returns a bare array; restore the column labels and index.
    normalized_df = pd.DataFrame(
        normalized_data,
        columns=expanded_df.columns,
        index=expanded_df.index,
    )
    return normalized_df


# Load the recorded steps CSV (presumably one PPO run on HalfCheetah-v5, judging by the path).
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
path = "/home/richtsai1103/CRL/src/results/HalfCheetah-v5/ppo_20241212_020732/selected_steps.csv"
df = pd.read_csv(path)
# Columns whose cells are string-encoded lists and need expanding into scalar columns.
list_columns = ['current_state', 'current_action', 'prev_state', 'prev_action']
# Despite its name, `expanded_df` holds the expanded AND standardized frame
# returned by process_large_dataset.
expanded_df = process_large_dataset(df, list_columns)
expanded_df

Processing current_state: 100%|██████████| 2/2 [00:00<00:00, 28.11it/s]
Processing current_action: 100%|██████████| 2/2 [00:00<00:00, 68.37it/s]
Processing prev_state: 100%|██████████| 2/2 [00:00<00:00, 31.07it/s]
Processing prev_action: 100%|██████████| 2/2 [00:00<00:00, 67.87it/s]
Processing columns: 100%|██████████| 4/4 [00:00<00:00, 16.26it/s]


Unnamed: 0,global_step,episode,current_reward,done,prev_reward,current_state_1,current_state_2,current_state_3,current_state_4,current_state_5,...,prev_state_14,prev_state_15,prev_state_16,prev_state_17,prev_action_1,prev_action_2,prev_action_3,prev_action_4,prev_action_5,prev_action_6
0,-1.731185,-1.414214,-1.323091,0.0,-0.950471,-0.272576,-1.701001,-1.269941,0.582522,0.529642,...,0.578668,-1.194030,0.184086,1.209419,1.373783,0.020068,0.397226,-0.982029,-0.850026,0.350307
1,-1.729453,-0.707107,-1.037890,0.0,-1.462812,1.326739,1.471474,-0.470064,0.240357,-0.588327,...,-0.841225,0.254198,-0.072943,-1.191217,-0.759454,0.762799,-0.294504,0.181151,0.637417,-0.484872
2,-1.727721,0.000000,-0.591575,0.0,-1.047586,-1.205878,-0.222795,1.032082,-0.936699,-1.515665,...,-0.530839,0.074299,0.291555,1.796288,-1.685172,-0.093831,-0.419836,0.178053,0.884844,0.442093
3,-1.725989,0.707107,-1.155956,0.0,0.432861,1.549789,-0.049213,-0.239077,0.408457,1.725231,...,0.764318,-0.612568,-0.861120,-0.273492,-0.909154,0.289674,0.079208,-0.285230,-0.216809,-0.260399
4,-1.724257,1.414214,-1.972636,0.0,-1.177665,1.595479,-1.748685,1.262667,1.654896,1.595034,...,0.120690,0.567307,0.152689,0.120595,-1.205507,0.430454,-1.441041,-0.023542,0.807343,0.345193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1.724257,-1.414214,-0.702189,0.0,-0.385321,-0.112793,-0.241961,-1.271933,0.789788,0.271121,...,0.309697,-1.059599,-0.465702,-1.631986,1.771411,0.960663,0.157740,-1.858215,-1.959152,-1.300859
1996,1.725989,-0.707107,0.436071,0.0,0.703060,0.297418,0.972961,-1.403345,1.337132,-0.179916,...,0.440128,-1.856293,0.057859,-1.175788,1.683922,0.596628,-0.412273,-0.780045,-1.654445,-1.033115
1997,1.727721,0.000000,0.079706,0.0,-0.390635,-1.291950,0.211660,-1.489609,0.343988,1.146782,...,0.053784,-1.027349,-0.686377,0.131564,2.379497,0.831690,-1.487757,-0.477734,-2.493255,0.348237
1998,1.729453,0.707107,0.862449,0.0,1.788566,-0.064739,-1.421626,-0.796399,1.545395,0.548863,...,0.640707,0.296109,-1.834003,-0.011890,-0.740945,-0.635055,-0.032840,0.616932,0.333908,0.514703


In [2]:
# Drop the bookkeeping columns by name instead of by position: `iloc[:, 2:]`
# removed two more columns every time this cell was re-run. With
# errors='ignore' a re-run is a no-op.
expanded_df = expanded_df.drop(columns=['global_step', 'episode'], errors='ignore')
expanded_df

Unnamed: 0,current_reward,done,prev_reward,current_state_1,current_state_2,current_state_3,current_state_4,current_state_5,current_state_6,current_state_7,...,prev_state_14,prev_state_15,prev_state_16,prev_state_17,prev_action_1,prev_action_2,prev_action_3,prev_action_4,prev_action_5,prev_action_6
0,-1.323091,0.0,-0.950471,-0.272576,-1.701001,-1.269941,0.582522,0.529642,0.389324,-0.445409,...,0.578668,-1.194030,0.184086,1.209419,1.373783,0.020068,0.397226,-0.982029,-0.850026,0.350307
1,-1.037890,0.0,-1.462812,1.326739,1.471474,-0.470064,0.240357,-0.588327,0.316577,-0.606991,...,-0.841225,0.254198,-0.072943,-1.191217,-0.759454,0.762799,-0.294504,0.181151,0.637417,-0.484872
2,-0.591575,0.0,-1.047586,-1.205878,-0.222795,1.032082,-0.936699,-1.515665,-0.338085,-1.099147,...,-0.530839,0.074299,0.291555,1.796288,-1.685172,-0.093831,-0.419836,0.178053,0.884844,0.442093
3,-1.155956,0.0,0.432861,1.549789,-0.049213,-0.239077,0.408457,1.725231,1.540156,-0.175788,...,0.764318,-0.612568,-0.861120,-0.273492,-0.909154,0.289674,0.079208,-0.285230,-0.216809,-0.260399
4,-1.972636,0.0,-1.177665,1.595479,-1.748685,1.262667,1.654896,1.595034,-1.220595,1.633930,...,0.120690,0.567307,0.152689,0.120595,-1.205507,0.430454,-1.441041,-0.023542,0.807343,0.345193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-0.702189,0.0,-0.385321,-0.112793,-0.241961,-1.271933,0.789788,0.271121,0.994129,-0.222410,...,0.309697,-1.059599,-0.465702,-1.631986,1.771411,0.960663,0.157740,-1.858215,-1.959152,-1.300859
1996,0.436071,0.0,0.703060,0.297418,0.972961,-1.403345,1.337132,-0.179916,1.420869,1.672574,...,0.440128,-1.856293,0.057859,-1.175788,1.683922,0.596628,-0.412273,-0.780045,-1.654445,-1.033115
1997,0.079706,0.0,-0.390635,-1.291950,0.211660,-1.489609,0.343988,1.146782,1.203936,-0.213643,...,0.053784,-1.027349,-0.686377,0.131564,2.379497,0.831690,-1.487757,-0.477734,-2.493255,0.348237
1998,0.862449,0.0,1.788566,-0.064739,-1.421626,-0.796399,1.545395,0.548863,0.813541,-1.131484,...,0.640707,0.296109,-1.834003,-0.011890,-0.740945,-0.635055,-0.032840,0.616932,0.333908,0.514703


In [3]:
# Summary statistics of the standardized columns (sanity check on the scaling).
expanded_df.describe()

Unnamed: 0,current_reward,done,prev_reward,current_state_1,current_state_2,current_state_3,current_state_4,current_state_5,current_state_6,current_state_7,...,prev_state_14,prev_state_15,prev_state_16,prev_state_17,prev_action_1,prev_action_2,prev_action_3,prev_action_4,prev_action_5,prev_action_6
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,-8.526513e-17,0.0,4.2632560000000003e-17,4.6185280000000004e-17,2.664535e-17,-2.398082e-17,3.5527140000000005e-17,2.4869000000000002e-17,1.7763570000000002e-17,-1.5987210000000002e-17,...,-2.442491e-18,1.7763570000000002e-17,1.9539930000000003e-17,1.865175e-17,2.842171e-17,1.4210850000000002e-17,-2.842171e-17,0.0,2.842171e-17,-7.105427e-18
std,1.00025,0.0,1.00025,1.00025,1.00025,1.00025,1.00025,1.00025,1.00025,1.00025,...,1.00025,1.00025,1.00025,1.00025,1.00025,1.00025,1.00025,1.00025,1.00025,1.00025
min,-3.15954,0.0,-2.836834,-1.719902,-1.799035,-1.693454,-1.732168,-1.692282,-1.731388,-1.799933,...,-2.776529,-1.902085,-2.329566,-2.151681,-3.027226,-2.340127,-3.08068,-3.175836,-3.645785,-2.852391
25%,-0.7776617,0.0,-0.7647043,-0.8680618,-0.8403434,-0.8914096,-0.8389194,-0.8473128,-0.8782975,-0.8188486,...,-0.71143,-0.699532,-0.5220458,-0.6719594,-0.7029762,-0.8563726,-0.6554367,-0.737379,-0.6399125,-0.6483655
50%,-0.0374725,0.0,-0.03289775,0.009230417,-0.005495064,-0.0008731252,-0.009868278,-0.02039891,0.02600902,0.00137659,...,0.09113916,-0.06860761,-0.1935962,-0.09111079,-0.07080107,-0.1024968,-0.01035166,-0.100201,0.01379701,-0.0473155
75%,0.7100988,0.0,0.7245875,0.8583979,0.8790374,0.8799081,0.8843019,0.8566383,0.8706391,0.8498013,...,0.7443171,0.3197495,0.2650268,0.600707,0.6837683,0.8882571,0.6040235,0.784679,0.6807476,0.5517533
max,2.467134,0.0,2.433273,1.70526,1.700295,1.727374,1.718994,1.74207,1.712394,1.726186,...,2.160397,2.28026,2.772387,2.566913,3.26747,2.402937,3.886648,3.178866,2.544974,4.386915


In [4]:
# `done` is constant in this data (std 0 in the summary above), so it carries
# no information. errors='ignore' keeps the cell safe to re-run once the
# column is already gone.
expanded_df = expanded_df.drop(columns=['done'], errors='ignore')

In [5]:
# Show the frame after the `done` column has been dropped.
expanded_df

Unnamed: 0,current_reward,prev_reward,current_state_1,current_state_2,current_state_3,current_state_4,current_state_5,current_state_6,current_state_7,current_state_8,...,prev_state_14,prev_state_15,prev_state_16,prev_state_17,prev_action_1,prev_action_2,prev_action_3,prev_action_4,prev_action_5,prev_action_6
0,-1.323091,-0.950471,-0.272576,-1.701001,-1.269941,0.582522,0.529642,0.389324,-0.445409,1.725297,...,0.578668,-1.194030,0.184086,1.209419,1.373783,0.020068,0.397226,-0.982029,-0.850026,0.350307
1,-1.037890,-1.462812,1.326739,1.471474,-0.470064,0.240357,-0.588327,0.316577,-0.606991,-0.390267,...,-0.841225,0.254198,-0.072943,-1.191217,-0.759454,0.762799,-0.294504,0.181151,0.637417,-0.484872
2,-0.591575,-1.047586,-1.205878,-0.222795,1.032082,-0.936699,-1.515665,-0.338085,-1.099147,-1.441308,...,-0.530839,0.074299,0.291555,1.796288,-1.685172,-0.093831,-0.419836,0.178053,0.884844,0.442093
3,-1.155956,0.432861,1.549789,-0.049213,-0.239077,0.408457,1.725231,1.540156,-0.175788,0.888697,...,0.764318,-0.612568,-0.861120,-0.273492,-0.909154,0.289674,0.079208,-0.285230,-0.216809,-0.260399
4,-1.972636,-1.177665,1.595479,-1.748685,1.262667,1.654896,1.595034,-1.220595,1.633930,1.350546,...,0.120690,0.567307,0.152689,0.120595,-1.205507,0.430454,-1.441041,-0.023542,0.807343,0.345193
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-0.702189,-0.385321,-0.112793,-0.241961,-1.271933,0.789788,0.271121,0.994129,-0.222410,-0.572468,...,0.309697,-1.059599,-0.465702,-1.631986,1.771411,0.960663,0.157740,-1.858215,-1.959152,-1.300859
1996,0.436071,0.703060,0.297418,0.972961,-1.403345,1.337132,-0.179916,1.420869,1.672574,1.049125,...,0.440128,-1.856293,0.057859,-1.175788,1.683922,0.596628,-0.412273,-0.780045,-1.654445,-1.033115
1997,0.079706,-0.390635,-1.291950,0.211660,-1.489609,0.343988,1.146782,1.203936,-0.213643,0.575951,...,0.053784,-1.027349,-0.686377,0.131564,2.379497,0.831690,-1.487757,-0.477734,-2.493255,0.348237
1998,0.862449,1.788566,-0.064739,-1.421626,-0.796399,1.545395,0.548863,0.813541,-1.131484,1.584200,...,0.640707,0.296109,-1.834003,-0.011890,-0.740945,-0.635055,-0.032840,0.616932,0.333908,0.514703


In [6]:
# Write the processed frame into the run's results directory. The row index is
# left out, the column names form the header line, and the file is UTF-8.
output_csv = '/home/richtsai1103/CRL/src/results/HalfCheetah-v5/ppo_20241212_020732/expanded_steps.csv'
expanded_df.to_csv(output_csv, index=False, header=True, encoding='utf-8')

In [7]:
# View the frame as a plain NumPy array; `to_numpy()` is the accessor the
# pandas documentation recommends over `.values`.
expanded_df.to_numpy()

array([[-1.32309149, -0.9504705 , -0.27257593, ..., -0.98202913,
        -0.85002631,  0.35030747],
       [-1.03789028, -1.46281156,  1.32673935, ...,  0.18115108,
         0.63741722, -0.48487182],
       [-0.59157488, -1.04758612, -1.20587779, ...,  0.17805319,
         0.88484421,  0.4420935 ],
       ...,
       [ 0.07970597, -0.39063545, -1.29195047, ..., -0.47773439,
        -2.49325503,  0.34823685],
       [ 0.86244861,  1.7885664 , -0.06473921, ...,  0.61693248,
         0.3339084 ,  0.51470311],
       [ 0.28255017, -0.99808618,  0.11243964, ...,  0.7352853 ,
        -0.98532978, -2.13665087]])

# Problem Formulation

We have a standard Markov Decision Process (MDP) defined by:

$$
(s_{t+1}, r_t) \sim P(s_{t+1}, r_t \mid s_t, a_t)
$$

where:

- $s_t \in S$ is the state,
- $a_t \in A$ is the action taken from the policy $\pi_\theta(a_t \mid s_t)$,
- $r_t \in R$ is the reward,
- $P(s_{t+1}, r_t \mid s_t, a_t)$ is the joint transition and reward distribution.

We modify this MDP by introducing an augmented state representation:

$$
\tilde{s}_t = (s_t, \hat{p}_t)
$$

where 

$$
\hat{p}_t = f_\phi(s_t, a_t)
$$

is the extra Peek feature predicted by a learned dynamics model $f_\phi$.

The goal is to prove that training PPO on $\tilde{s}_t$ leads to improved policy performance.

## Effect on Policy Gradient Variance

PPO uses the advantage function:

$$
A_t = Q(s_t, a_t) - V(s_t)
$$

where:

- $Q(s_t, a_t)$ is the state-action value function,
- $V(s_t) = \mathbb{E}_{a_t \sim \pi}[Q(s_t, a_t)]$ is the value function.

PPO updates the policy by maximizing the clipped surrogate objective:

$$
L_{\text{PPO}}(\theta) = \mathbb{E}_{s,a \sim \pi} \left[ \min \left( r_t(\theta) A_t, \operatorname{clip}(r_t(\theta), 1 - \epsilon, 1 + \epsilon) A_t \right) \right]
$$

where

$$
 r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\text{old}}}(a_t \mid s_t)}
$$

is the probability ratio.

Using the augmented state $\tilde{s}_t$, we redefine:

$$
A_t' = Q(\tilde{s}_t, a_t) - V(\tilde{s}_t)
$$

If $\hat{p}_t$ provides useful predictive information about future rewards, then:

$$
V(\tilde{s}_t) = \mathbb{E}_{a_t \sim \pi}[Q(\tilde{s}_t, a_t)]
$$

is a lower variance estimator of $Q(s_t, a_t)$, because it incorporates additional information.

### Proof by Variance Reduction

By the law of total variance, the variance of the original advantage estimate is:

$$
\operatorname{Var}[A_t] = \operatorname{Var}[Q(s_t, a_t) - V(s_t)]
$$

With the augmented state $\tilde{s}_t$:

$$
\operatorname{Var}[A_t'] = \operatorname{Var}[Q(\tilde{s}_t, a_t) - V(\tilde{s}_t)]
$$

Since $\tilde{s}_t$ includes additional predictive features, the conditional variance satisfies:

$$
\operatorname{Var}[Q(\tilde{s}_t, a_t) \mid \tilde{s}_t] \leq \operatorname{Var}[Q(s_t, a_t) \mid s_t]
$$

This implies:

$$
\operatorname{Var}[A_t'] \leq \operatorname{Var}[A_t]
$$

Since PPO’s policy gradient update depends on the expectation of $A_t$, a lower-variance estimate leads to more stable updates and improved policy convergence.

## Effect on Value Function Approximation

PPO also trains a value function $V_\theta(s_t)$ by minimizing the squared error between its prediction and the empirical return:

$$
L_{\text{VF}}(\theta) = \mathbb{E}_{s_t} \left[ (V_\theta(s_t) - R_t)^2 \right]
$$

where $R_t$ is the return.

With the augmented state, the loss function becomes:

$$
L_{\text{VF}}'(\theta) = \mathbb{E}_{\tilde{s}_t} \left[ (V_\theta(\tilde{s}_t) - R_t)^2 \right]
$$

Since $\tilde{s}_t$ includes the predicted feature $\hat{p}_t$, it provides a more informative state representation. If $\hat{p}_t$ correlates with long-term returns, then:

$$
\mathbb{E} \left[ (V_\theta(\tilde{s}_t) - R_t)^2 \right] \leq \mathbb{E} \left[ (V_\theta(s_t) - R_t)^2 \right]
$$

which means the value function has lower approximation error.

By improving $V(s_t)$, the advantage estimates become more accurate, leading to better PPO updates.

## Sample Efficiency and Convergence Rate

In reinforcement learning, sample efficiency is often analyzed using policy improvement guarantees.

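Define $\pi^*$ as the optimal policy and let $\pi^{(k)}$ be the policy at iteration $k$. PPO’s clipped objective is designed to give (approximately) monotonic improvement in expected return, although unlike trust-region methods it carries no formal guarantee: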

$$
J(\pi^{(k+1)}) \geq J(\pi^{(k)})
$$

However, faster improvement depends on how well the advantage function is estimated.

From our previous variance reduction proof:

$$
\operatorname{Var}[A_t'] \leq \operatorname{Var}[A_t]
$$

which implies that policy updates are less noisy, leading to faster improvement in policy performance.


# Hoeffding’s Inequality

The Hoeffding bound states that if $X_1, X_2, \dots, X_n$ are independent and bounded random variables such that:

$$
a \leq X_i \leq b
$$

for all $i$, then for their empirical mean:

$$
\bar{X} = \frac{1}{n} \sum_{i=1}^{n} X_i
$$

the probability that $\bar{X}$ deviates from its expectation $E[\bar{X}]$ by more than $\epsilon$ is bounded by:

$$
P\left( \left| \bar{X} - E[\bar{X}] \right| \geq \epsilon \right) \leq 2 \exp \left( \frac{-2n\epsilon^2}{(b-a)^2} \right)
$$

This tells us that:

- More samples (larger $n$) reduce the probability of deviation.
- A narrower range (smaller $b-a$) leads to a tighter bound, meaning we need fewer samples for the same confidence level.

# Applying Hoeffding’s Bound to PPO Advantage Estimation

In PPO, we estimate the advantage function:

$$
A_t = Q(s_t, a_t) - V(s_t)
$$

where:

- $Q(s_t, a_t)$ is the state-action value function.
- $V(s_t)$ is the value function.

Since PPO updates the policy based on the advantage function $A_t$, accurate estimation of $A_t$ is crucial for stable training.

In practice, $A_t$ is estimated using Monte Carlo rollouts or Generalized Advantage Estimation (GAE), which involves averaging multiple samples:

$$
\hat{A} = \frac{1}{n} \sum_{i=1}^{n} A_i
$$

By Hoeffding’s bound:

$$
P\left( \left| \hat{A} - E[\hat{A}] \right| \geq \epsilon \right) \leq 2 \exp \left( \frac{-2n\epsilon^2}{\sigma_A^2} \right)
$$

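where $\sigma_A^2$ is the variance of the advantage estimates. Strictly speaking, Hoeffding’s inequality has the squared range $(b-a)^2$ of the samples in the denominator; using $\sigma_A^2$ in its place is a heuristic, and a rigorous variance-based statement would instead follow from a Bernstein-type or sub-Gaussian bound.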

### Key Insight: Reducing $\sigma_A^2$ Lowers Sample Complexity

- If the variance of advantage estimates $\sigma_A^2$ is high, then we need more samples $n$ to achieve a desired error bound $\epsilon$.
- If the variance $\sigma_A^2$ is low, we need fewer samples to reach the same confidence level.

Thus, reducing variance in $A_t$ accelerates PPO training by decreasing the required number of interactions with the environment.

# Effect of Extra Features on Variance Reduction

Now, we analyze how adding extra features $\hat{e}_t = f_\phi(s_t, a_t)$ (the Peek feature written $\hat{p}_t$ in the problem formulation) impacts variance.

The new augmented state representation:

$$
\tilde{s}_t = (s_t, \hat{e}_t)
$$

leads to a better advantage estimate:

$$
A_t' = Q(\tilde{s}_t, a_t) - V(\tilde{s}_t)
$$

If $\hat{e}_t$ contains useful predictive information, it helps in reducing the variance of the value estimate:

$$
\operatorname{Var}[V(\tilde{s}_t)] \leq \operatorname{Var}[V(s_t)]
$$

Since $A_t'$ is derived from $Q(\tilde{s}_t, a_t) - V(\tilde{s}_t)$, the variance of the advantage function also decreases:

$$
\operatorname{Var}[A_t'] \leq \operatorname{Var}[A_t]
$$

By Hoeffding’s bound, reducing $\operatorname{Var}[A_t]$ directly reduces the number of samples needed for a given confidence level.

Thus, using extra features in the observation space improves PPO’s sample efficiency, making training faster and more stable.
