# In [None]:
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
import matplotlib.pyplot as plt
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.evaluation import evaluate_policy
from typing import Dict, List, Tuple, Optional

class PortfolioEnv(gym.Env):
    """
    Portfolio allocation environment using the Gymnasium interface.

    One "month" is made of ``n_stocks`` consecutive steps. At each step the
    agent picks 0%, 10%, 20% or 30% for the current stock; a request larger
    than what is still unallocated is reduced to the remaining amount. After
    the last stock, any remainder is added to the last stock so the
    allocation always sums to 100% (the last stock can therefore end up above
    30%). The reward is non-zero only on the step that completes a month.
    """

    metadata = {'render_modes': ['human']}

    TOTAL_ALLOCATION = 100  # percentage points distributed every month
    ALLOCATION_STEP = 10    # percentage points per unit of the discrete action
    ROWS_PER_MONTH = 30     # data rows the environment advances per month
    LOOKBACK_ROWS = 20      # rows skipped at the start and reserved at the end

    def __init__(self, data_path: str, n_stocks: int = 10, episode_length: int = 12):
        """
        Args:
            data_path: Path to the CSV data file. It must contain a ``Close``
                column and every column listed in ``self.features``.
            n_stocks: Number of stocks to allocate (default: 10)
            episode_length: Length of episode in months (default: 12)

        Raises:
            ValueError: If ``n_stocks`` is smaller than 1, or (from the
                initial ``reset()``) if the data is too short for an episode.
        """
        super().__init__()

        if n_stocks < 1:
            raise ValueError(f"n_stocks must be at least 1, got {n_stocks}")

        # Load and prepare data
        self.raw_data = pd.read_csv(data_path)
        self.n_stocks = n_stocks
        self.episode_length = episode_length

        # Create synthetic stocks (in practice, would use real data for each stock).
        # NOTE(review): every stock is the same price series multiplied by one
        # constant factor and shares all feature columns, so the percentage
        # returns computed in step() are identical for all stocks and the
        # reward does not depend on the chosen allocation. Load distinct data
        # per stock for a meaningful experiment.
        self.stocks = {}
        for i in range(n_stocks):
            stock_data = self.raw_data.copy()
            stock_data['Close'] = stock_data['Close'] * (1 + np.random.normal(0, 0.05))
            self.stocks[f'stock_{i}'] = stock_data

        # Key features for state representation
        self.features = [
            'Close_scaled', 'MA5_scaled', 'MA20_scaled', 'MA50_scaled', 'MA200_scaled',
            'RSI_scaled', 'BB_width_scaled', 'ATR_scaled', 'Return_1W_scaled',
            'Return_1M_scaled', 'Return_3M_scaled', 'CurrentDrawdown_scaled',
            'MaxDrawdown_252d_scaled', 'Sharpe_20d_scaled', 'Sharpe_60d_scaled'
        ]

        # Precompute the per-row feature vector once: row t holds the features
        # of stock_0, then stock_1, ... This avoids a pandas row lookup per
        # stock on every environment step.
        self._feature_matrix = np.concatenate(
            [frame[self.features].to_numpy(dtype=np.float32)
             for frame in self.stocks.values()],
            axis=1,
        )
        # Lazily filled cache: column name -> array of shape (n_stocks, n_rows).
        self._column_cache: Dict[str, np.ndarray] = {}

        # State: 15 features per stock + current stock index + remaining allocation
        obs_dim = len(self.features) * n_stocks + 2
        self.observation_space = spaces.Box(
            low=-10, high=10,
            shape=(obs_dim,),
            dtype=np.float32
        )

        # Action space: 4 options (0%, 10%, 20%, 30%)
        self.action_space = spaces.Discrete(4)

        # Initialize
        self.reset()

    def _column(self, name: str) -> np.ndarray:
        """Return column ``name`` of every stock as a (n_stocks, n_rows) array."""
        values = self._column_cache.get(name)
        if values is None:
            values = np.stack([
                self.stocks[f'stock_{i}'][name].to_numpy(dtype=np.float64)
                for i in range(self.n_stocks)
            ])
            self._column_cache[name] = values
        return values

    def reset(self, seed=None, options=None):
        """
        Reset environment for new episode.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset``; it makes the
                randomly chosen start row reproducible.
            options: Unused.

        Returns:
            Tuple of (initial observation, empty info dict).

        Raises:
            ValueError: If the data has too few rows for a full episode.
        """
        super().reset(seed=seed)

        # Find a random starting point that allows for a full episode
        data_length = len(self.raw_data)
        max_start_idx = (
            data_length - self.episode_length * self.ROWS_PER_MONTH - self.LOOKBACK_ROWS
        )
        if max_start_idx <= self.LOOKBACK_ROWS:
            raise ValueError(
                f"Data has {data_length} rows, which is too short for "
                f"{self.episode_length} months of {self.ROWS_PER_MONTH} rows each"
            )
        # Use the environment's own seeded generator rather than global np.random.
        self.current_step = int(self.np_random.integers(self.LOOKBACK_ROWS, max_start_idx))
        self.current_month = 0

        # Reset allocation process
        self.allocation = np.zeros(self.n_stocks)
        self.remaining_allocation = self.TOTAL_ALLOCATION
        self.current_stock_idx = 0

        return self._get_observation(), {}

    def _get_observation(self):
        """
        Get current state representation.

        Returns:
            float32 vector: the features of every stock at ``current_step``,
            followed by the normalized current stock index and the remaining
            allocation fraction, clipped to the observation space bounds.
        """
        context = np.array(
            [
                self.current_stock_idx / self.n_stocks,
                self.remaining_allocation / self.TOTAL_ALLOCATION,
            ],
            dtype=np.float32,
        )
        observation = np.concatenate((self._feature_matrix[self.current_step], context))
        # Keep the observation inside the declared Box so it honours the space.
        return np.clip(observation, self.observation_space.low, self.observation_space.high)

    def step(self, action):
        """
        Take allocation action for current stock and move to next

        Args:
            action: Integer in [0, 1, 2, 3] corresponding to [0%, 10%, 20%, 30%]

        Returns:
            observation: New state observation
            reward: Reward (only non-zero at end of month)
            terminated: Whether episode is terminated
            truncated: Whether episode is truncated (always False here)
            info: Portfolio return, Sharpe, drawdown and allocation on the
                step that completes a month; empty otherwise
        """
        # The remaining amount is always a non-negative multiple of the step
        # size, so capping the request is the same as choosing the highest
        # valid option.
        requested = int(action) * self.ALLOCATION_STEP
        granted = min(requested, self.remaining_allocation)

        self.allocation[self.current_stock_idx] = granted
        self.remaining_allocation -= granted
        self.current_stock_idx += 1

        reward = 0.0
        terminated = False
        truncated = False
        info = {}

        # If we've allocated to all stocks (completed a month)
        if self.current_stock_idx == self.n_stocks:
            # The last stock absorbs whatever is still unallocated so the
            # portfolio sums to 100%.
            if self.remaining_allocation:
                self.allocation[-1] += self.remaining_allocation
                self.remaining_allocation = 0

            close = self._column('Close')
            current_prices = close[:, self.current_step]

            # Move to end of month, clamped to the last data row
            next_step = min(self.current_step + self.ROWS_PER_MONTH, len(self.raw_data) - 1)
            next_prices = close[:, next_step]

            # Calculate stock returns and portfolio return
            stock_returns = (next_prices - current_prices) / current_prices
            portfolio_return = np.sum((self.allocation / 100) * stock_returns)

            # Risk metrics are read at the start-of-month row
            sharpe = self._calculate_portfolio_metric('Sharpe_20d_scaled')
            max_drawdown = self._calculate_portfolio_metric('MaxDrawdown_252d_scaled')

            reward = float(self._calculate_reward(portfolio_return, sharpe, max_drawdown))

            info = {
                'portfolio_return': portfolio_return,
                'sharpe': sharpe,
                'max_drawdown': max_drawdown,
                'allocation': self.allocation.copy()
            }

            # Move to next month
            self.current_step = next_step
            self.current_month += 1

            # Check if episode is done
            terminated = (self.current_month >= self.episode_length)

            # Start a fresh allocation round; self.allocation keeps the last
            # completed portfolio until it is overwritten stock by stock.
            self.current_stock_idx = 0
            self.remaining_allocation = self.TOTAL_ALLOCATION

        return self._get_observation(), reward, terminated, truncated, info

    def _calculate_portfolio_metric(self, metric_name):
        """
        Calculate weighted portfolio metric.

        Args:
            metric_name: Column to read for every stock at ``current_step``.

        Returns:
            Sum of the per-stock values weighted by the current allocation.
        """
        metric_values = self._column(metric_name)[:, self.current_step]
        return np.sum((self.allocation / 100) * metric_values)

    def _calculate_reward(self, portfolio_return, sharpe, max_drawdown):
        """
        Calculate reward based on return, Sharpe ratio, and drawdown.

        The base term is the return in excess of a 1% monthly target, scaled
        by 100 and multiplied by 1.5 when the return itself is negative. The
        Sharpe value adds ``0.5 * sharpe`` and the drawdown value adds
        ``-2.0 * max_drawdown``.
        """
        base_reward = (portfolio_return - 0.01) * 100

        # Extra penalty for negative returns
        if portfolio_return < 0:
            base_reward *= 1.5

        risk_adjustment = sharpe * 0.5
        drawdown_penalty = max_drawdown * -2.0

        return base_reward + risk_adjustment + drawdown_penalty

    def render(self, mode='human'):
        """Print the stored allocation when no stock of the current month is allocated yet."""
        if self.current_stock_idx == 0:  # Just allocated a full portfolio
            print(f"Month {self.current_month}, Allocation: {self.allocation}")

    def close(self):
        """Clean up resources (nothing to release)."""
        pass


class TensorboardCallback(BaseCallback):
    """
    Custom callback that logs portfolio metrics reported by the environment.

    Whenever an ``info`` dict of the current step contains
    ``'portfolio_return'``, its return, Sharpe and drawdown values are added
    to the logger under the ``portfolio/`` prefix.
    """
    def __init__(self, verbose=0):
        super().__init__(verbose)

    def _on_step(self) -> bool:
        """
        Record portfolio metrics from every sub-environment's info dict.

        ``record_mean`` is used so that the value written at each logger dump
        is the average over all completed months since the previous dump,
        instead of only the most recently recorded sample.

        Returns:
            Always True, so training is never interrupted by this callback.
        """
        for info in self.locals.get('infos') or ():
            if 'portfolio_return' not in info:
                continue
            self.logger.record_mean('portfolio/return', info['portfolio_return'])
            self.logger.record_mean('portfolio/sharpe', info['sharpe'])
            self.logger.record_mean('portfolio/max_drawdown', info['max_drawdown'])
        return True


def train_model(env, total_timesteps=100000, log_dir='./logs/'):
    """
    Train a PPO model on the portfolio environment.

    Args:
        env: Environment to train on.
        total_timesteps: Number of environment steps to train for.
        log_dir: Directory passed to PPO as ``tensorboard_log``.

    Returns:
        The trained PPO model.
    """
    # Stable-Baselines3 removed shared layers from the MLP extractor, so the
    # old list form ``[256, 128, dict(vf=[64], pi=[64])]`` is no longer
    # accepted. Give the policy and value networks the same layer sizes
    # through the dict form instead.
    layer_sizes = [256, 128, 64]
    policy_kwargs = {'net_arch': dict(pi=list(layer_sizes), vf=list(layer_sizes))}

    # Initialize the model
    model = PPO(
        "MlpPolicy",
        env,
        verbose=1,
        tensorboard_log=log_dir,
        learning_rate=3e-4,
        gamma=0.99,
        n_steps=2048,
        ent_coef=0.01,
        vf_coef=0.5,
        max_grad_norm=0.5,
        gae_lambda=0.95,
        clip_range=0.2,
        clip_range_vf=None,
        normalize_advantage=True,
        policy_kwargs=policy_kwargs
    )

    # Train the model
    model.learn(
        total_timesteps=total_timesteps,
        callback=TensorboardCallback(),
        progress_bar=True
    )

    return model


def evaluate_model(model, env, n_episodes=10):
    """
    Evaluate the trained model and save two summary plots.

    First runs ``evaluate_policy`` and prints the mean and standard deviation
    of the episode reward. Then plays ``n_episodes`` deterministic episodes,
    collecting the metrics the environment reports in ``info`` whenever it
    contains ``'portfolio_return'``. Writes ``portfolio_allocation.png`` and
    ``returns_distribution.png`` to the working directory.

    Args:
        model: Trained model exposing ``predict``.
        env: Environment to evaluate on.
        n_episodes: Number of evaluation episodes (at least 1).

    Returns:
        Dict with ``mean_return``, ``mean_sharpe`` and ``mean_drawdown``
        (each the mean over episodes of the per-episode mean) and
        ``allocations`` (average allocation per stock).

    Raises:
        ValueError: If ``n_episodes`` is smaller than 1.
    """
    if n_episodes < 1:
        raise ValueError(f"n_episodes must be at least 1, got {n_episodes}")

    mean_reward, std_reward = evaluate_policy(
        model,
        env,
        n_eval_episodes=n_episodes,
        deterministic=True
    )

    print(f"Mean reward: {mean_reward:.3f} +/- {std_reward:.3f}")

    # Run detailed evaluation for visualization
    allocations = []
    returns = []
    sharpes = []
    drawdowns = []
    monthly_returns = []  # every reported return, across all episodes

    for _ in range(n_episodes):
        # Reset at the start of each episode; no extra reset after the last one.
        obs, info = env.reset()

        episode_allocations = []
        episode_returns = []
        episode_sharpes = []
        episode_drawdowns = []
        done = False

        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = env.step(action)

            done = terminated or truncated

            if 'portfolio_return' in info:
                episode_allocations.append(info['allocation'])
                episode_returns.append(info['portfolio_return'])
                episode_sharpes.append(info['sharpe'])
                episode_drawdowns.append(info['max_drawdown'])

        allocations.append(episode_allocations)
        returns.append(np.mean(episode_returns))
        sharpes.append(np.mean(episode_sharpes))
        drawdowns.append(np.mean(episode_drawdowns))
        monthly_returns.extend(episode_returns)

    # Plot average allocation
    avg_allocations = np.mean([np.mean(ep_allocs, axis=0) for ep_allocs in allocations], axis=0)
    stock_positions = range(len(avg_allocations))

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(stock_positions, avg_allocations)
    ax.set_xlabel('Stock')
    ax.set_ylabel('Average Allocation (%)')
    ax.set_title('Average Portfolio Allocation')
    ax.set_xticks(stock_positions)
    ax.set_xticklabels([f'Stock {i}' for i in stock_positions])
    fig.savefig('portfolio_allocation.png')
    plt.close(fig)  # release the figure once it is on disk

    # Plot the distribution of the individual monthly returns, as the axis
    # label and title state (not the per-episode averages).
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(monthly_returns, bins=10)
    ax.set_xlabel('Monthly Return')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Distribution of Monthly Returns (Mean: {np.mean(monthly_returns):.4f})')
    fig.savefig('returns_distribution.png')
    plt.close(fig)

    return {
        'mean_return': np.mean(returns),
        'mean_sharpe': np.mean(sharpes),
        'mean_drawdown': np.mean(drawdowns),
        'allocations': avg_allocations
    }


def validate_env(env):
    """
    Run a quick sanity check on the environment.

    Passes the environment through ``check_env``, resets it, prints the shape
    of the first observation, then takes five randomly sampled actions and
    prints each action together with the reward it produced.
    """
    # API conformance check
    check_env(env)

    # Fresh episode, report what the agent will see
    first_obs, _ = env.reset()
    print("Initial observation shape:", first_obs.shape)

    # A handful of random steps as a smoke test
    n_smoke_steps = 5
    for step_no in range(n_smoke_steps):
        sampled_action = env.action_space.sample()
        step_result = env.step(sampled_action)
        step_reward = step_result[1]
        print(f"Step {step_no}: Action={sampled_action}, Reward={step_reward}")

    print("Validation complete!")


def main():
    """
    Run the whole pipeline: build the environment from ``paste.txt``,
    validate it, train a PPO agent for 100000 timesteps, save the agent as
    ``ppo_portfolio``, evaluate it and print the averaged metrics.
    """
    dataset_file = 'paste.txt'
    portfolio_env = PortfolioEnv(dataset_file)

    print("Validating environment...")
    validate_env(portfolio_env)

    print("Training PPO agent...")
    agent = train_model(portfolio_env, total_timesteps=100000)
    agent.save("ppo_portfolio")

    print("Evaluating model...")
    summary = evaluate_model(agent, portfolio_env)

    print("Training and evaluation complete!")
    report_lines = (
        ("Average monthly return", 'mean_return'),
        ("Average Sharpe ratio", 'mean_sharpe'),
        ("Average max drawdown", 'mean_drawdown'),
    )
    for label, key in report_lines:
        print(f"{label}: {summary[key]:.4f}")


# Run the validate/train/evaluate pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()