# In [None]:
import numpy as np
import pandas as pd
import os
import gymnasium as gym
from gymnasium import spaces
import matplotlib.pyplot as plt
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.evaluation import evaluate_policy
from typing import Dict, List, Tuple, Optional

class PortfolioEnv(gym.Env):
    """
    Portfolio allocation environment for reinforcement learning.

    This environment simulates a portfolio manager making monthly allocation decisions:
    - Each month, the agent allocates capital to 10 stocks
    - For each stock, it can allocate 0%, 10%, 20%, or 30%
    - Total allocation must equal 100%
    - The goal is to maximize returns while managing risk

    A single step represents allocating to one stock.
    A full allocation cycle (10 steps) represents a complete portfolio allocation for one month.
    The reward is calculated at the end of each month based on portfolio performance.

    Requested allocations are clamped so that the monthly total is always exactly
    100% without any stock exceeding the per-stock cap: a request is lowered when
    it exceeds what is left, and raised when the stocks still to come could not
    absorb the remainder at the cap.
    """

    metadata = {'render_modes': ['human']}

    # Number of stock_<i>.csv files the loader expects.
    N_STOCK_FILES = 10
    # Allocation granularity and per-stock cap, both in percentage points.
    ALLOCATION_STEP = 10
    MAX_ALLOCATION = 30
    # Number of data rows the environment advances per simulated month.
    ROWS_PER_MONTH = 30
    # Rows kept free at both ends of the data when choosing an episode start.
    LOOKBACK_ROWS = 20
    # Symmetric bound of the observation Box; features are clipped into it.
    OBS_BOUND = 10.0

    def __init__(self, data_dir: str, episode_length: int = 12):
        """
        Initialize the portfolio environment.

        Args:
            data_dir: Directory containing CSV files for each stock (stock_0.csv, stock_1.csv, etc.)
            episode_length: Number of months in an episode (default: 12)

        Raises:
            FileNotFoundError: If a stock file is missing and the directory has no CSV to fall back on.
            ValueError: If episode_length is below 1, a file has no rows, a 'Close'
                column is missing, or the aligned data is too short for an episode.
        """
        super().__init__()

        if episode_length < 1:
            raise ValueError(f"episode_length must be at least 1, got {episode_length}")

        # Load stock data from individual CSV files
        self.stocks = self._load_stock_data(data_dir)
        self.n_stocks = len(self.stocks)
        self.episode_length = episode_length

        # Key features used for state representation
        self.features = [
            'Close_scaled', 'MA5_scaled', 'MA20_scaled', 'MA50_scaled', 'MA200_scaled',
            'RSI_scaled', 'BB_width_scaled', 'ATR_scaled', 'Return_1W_scaled',
            'Return_1M_scaled', 'Return_3M_scaled', 'CurrentDrawdown_scaled',
            'MaxDrawdown_252d_scaled', 'Sharpe_20d_scaled', 'Sharpe_60d_scaled'
        ]

        # State: one value per feature per stock + current stock index + remaining allocation
        obs_dim = len(self.features) * self.n_stocks + 2
        self.observation_space = spaces.Box(
            low=-self.OBS_BOUND, high=self.OBS_BOUND,
            shape=(obs_dim,),
            dtype=np.float32
        )

        # Action space: one action per allocation level (0%, 10%, 20%, 30%)
        self.action_space = spaces.Discrete(self.MAX_ALLOCATION // self.ALLOCATION_STEP + 1)

        # Determine the valid date range across all stocks, then make every
        # stock share the same rows so that one row index means one date.
        self.start_date, self.end_date = self._get_common_date_range()
        self._align_stocks()
        self._build_arrays()

        # Initialize episode state so the environment is usable right away
        self.reset()

    def _load_stock_data(self, data_dir: str) -> Dict[str, pd.DataFrame]:
        """
        Load stock data from individual CSV files.

        If stock_<i>.csv is missing, the alphabetically first CSV in the directory
        is used instead and a warning is printed.

        Args:
            data_dir: Directory containing CSV files for each stock

        Returns:
            Dictionary mapping stock names to DataFrames (sorted by 'Date' when present)

        Raises:
            FileNotFoundError: If a stock file is missing and no CSV exists in data_dir.
            ValueError: If a loaded file contains no data rows.
        """
        stocks = {}
        fallback_file = None  # resolved lazily, only when a stock file is missing

        for i in range(self.N_STOCK_FILES):
            file_path = os.path.join(data_dir, f"stock_{i}.csv")

            if not os.path.exists(file_path):
                if fallback_file is None:
                    # Sorted so the fallback does not depend on os.listdir order
                    csv_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.csv'))
                    if not csv_files:
                        raise FileNotFoundError(f"No CSV files found in {data_dir}")
                    fallback_file = csv_files[0]

                file_path = os.path.join(data_dir, fallback_file)
                print(f"Warning: stock_{i}.csv not found, using {fallback_file} instead")

            df = pd.read_csv(file_path)
            if df.empty:
                raise ValueError(f"{file_path} contains no data rows")

            # Ensure the Date column is parsed and the rows are chronological
            if 'Date' in df.columns:
                df['Date'] = pd.to_datetime(df['Date'])
                df = df.sort_values('Date')

            stocks[f'stock_{i}'] = df.reset_index(drop=True)

        return stocks

    def _get_common_date_range(self) -> Tuple[str, str]:
        """
        Find the common date range across all stock data.

        With 'Date' columns the range is the latest first date to the earliest
        last date; without them it is the row-index range shared by all stocks.

        Returns:
            Tuple of (start_date, end_date) as strings

        Raises:
            ValueError: If only some stocks have a 'Date' column, if dates are
                missing, or if the common range is too short for an episode.
        """
        has_dates = ['Date' in df.columns for df in self.stocks.values()]
        if any(has_dates) and not all(has_dates):
            raise ValueError("Either every stock file must have a 'Date' column or none of them may")

        min_length = (self.episode_length + 1) * self.ROWS_PER_MONTH  # +1 for lookback
        too_short = f"Common date range too short for episode length {self.episode_length}"

        if all(has_dates):
            start_date = max(df['Date'].min() for df in self.stocks.values())
            end_date = min(df['Date'].max() for df in self.stocks.values())
            if pd.isna(start_date) or pd.isna(end_date):
                raise ValueError("Stock data contains no valid dates")

            # Coarse calendar-day check; the exact row count is validated after alignment
            if (end_date - start_date).days < min_length:
                raise ValueError(too_short)

            return start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')

        # Index-based data: every stock starts at row 0
        max_start = 0
        min_end = min(len(df) for df in self.stocks.values()) - 1
        if min_end - max_start < min_length:
            raise ValueError(too_short)

        return str(max_start), str(min_end)

    def _align_stocks(self) -> None:
        """
        Restrict every stock to the rows they all share.

        With 'Date' columns, only dates present in every stock are kept (first
        occurrence per date). Without them, every stock is truncated to the
        shortest length. Afterwards all DataFrames have the same number of rows.

        Raises:
            ValueError: If too few rows remain to fit one episode plus lookback margins.
        """
        frames = self.stocks

        if all('Date' in df.columns for df in frames.values()):
            common_dates = None
            for df in frames.values():
                dates = set(df['Date'])
                common_dates = dates if common_dates is None else common_dates & dates

            keep = list(common_dates)
            aligned = {
                name: (
                    df[df['Date'].isin(keep)]
                    .drop_duplicates(subset='Date')
                    .reset_index(drop=True)
                )
                for name, df in frames.items()
            }
        else:
            shortest = min(len(df) for df in frames.values())
            aligned = {
                name: df.iloc[:shortest].reset_index(drop=True)
                for name, df in frames.items()
            }

        n_rows = len(next(iter(aligned.values())))
        # reset() draws a start in [LOOKBACK_ROWS, n_rows - episode rows - LOOKBACK_ROWS)
        required = self.episode_length * self.ROWS_PER_MONTH + 2 * self.LOOKBACK_ROWS + 1
        if n_rows < required:
            raise ValueError(
                f"Only {n_rows} aligned rows available; need at least {required} "
                f"for episode length {self.episode_length}"
            )

        self.stocks = aligned

    @staticmethod
    def _numeric_column(frame: pd.DataFrame, column: str) -> np.ndarray:
        """Return a column as float64, or zeros when the column is absent."""
        if column not in frame.columns:
            return np.zeros(len(frame))
        return pd.to_numeric(frame[column], errors='coerce').to_numpy(dtype=np.float64)

    def _build_arrays(self) -> None:
        """
        Precompute NumPy views of the data used on every step.

        Builds the close-price matrix (rows x stocks) and the feature matrix
        (rows x stocks*features, stock-major order). Missing features are
        filled with 0.0, NaNs become 0.0, and values are clipped into the
        observation bounds.

        Raises:
            ValueError: If any stock lacks a 'Close' column.
        """
        frames = list(self.stocks.values())
        self._n_rows = len(frames[0])

        missing_close = [name for name, df in self.stocks.items() if 'Close' not in df.columns]
        if missing_close:
            raise ValueError(f"Missing 'Close' column for: {', '.join(missing_close)}")

        self._close = np.column_stack([self._numeric_column(df, 'Close') for df in frames])

        feature_columns = [
            self._numeric_column(df, feature)
            for df in frames
            for feature in self.features
        ]
        feature_matrix = np.nan_to_num(
            np.column_stack(feature_columns),
            nan=0.0, posinf=self.OBS_BOUND, neginf=-self.OBS_BOUND
        )
        self._feature_matrix = np.clip(
            feature_matrix, -self.OBS_BOUND, self.OBS_BOUND
        ).astype(np.float32)

        # Per-metric (rows x stocks) matrices, filled on first use
        self._metric_cache = {}

    def reset(self, seed=None, options=None):
        """
        Reset environment for new episode.

        This method is called at the beginning of each episode. It:
        1. Selects a random starting point in the data
        2. Resets all portfolio allocations
        3. Prepares the initial state

        Args:
            seed: Optional seed forwarded to the base class random generator
            options: Unused

        Returns:
            Initial observation and empty info dict
        """
        super().reset(seed=seed)

        # Select a random starting row that leaves room for a full episode.
        # self.np_random is used so that reset(seed=...) is reproducible.
        max_start_idx = (
            self._n_rows - self.episode_length * self.ROWS_PER_MONTH - self.LOOKBACK_ROWS
        )
        self.current_step = int(self.np_random.integers(self.LOOKBACK_ROWS, max_start_idx))
        self.current_month = 0

        # Reset allocation process
        self.allocation = np.zeros(self.n_stocks)
        self.remaining_allocation = 100
        self.current_stock_idx = 0

        # Reset performance tracking
        self.monthly_returns = []

        return self._get_observation(), {}

    def _get_observation(self):
        """
        Create the state observation vector.

        The observation includes:
        1. Features for each stock at the current time step
        2. Which stock we're currently allocating to (normalized index)
        3. How much allocation percentage remains

        Returns:
            Numpy float32 array of shape (obs_dim,) containing the observation
        """
        context = np.array(
            [
                self.current_stock_idx / self.n_stocks,  # Normalized index
                self.remaining_allocation / 100,  # Remaining allocation %
            ],
            dtype=np.float32,
        )
        return np.concatenate((self._feature_matrix[self.current_step], context))

    def step(self, action):
        """
        Take an allocation action and advance the environment.

        For each step, the agent decides how much to allocate to the current stock.
        After all stocks have been allocated, the environment calculates portfolio
        performance, gives a reward, and advances by one month.

        Args:
            action: Integer in [0, 1, 2, 3] corresponding to [0%, 10%, 20%, 30%]

        Returns:
            observation: New state observation
            reward: Reward (only non-zero at end of month)
            terminated: Whether episode is terminated
            truncated: Whether episode is truncated (always False here)
            info: Month summary at the end of a month, otherwise empty
        """
        # .item() accepts Python ints as well as 0-d / size-1 arrays from model.predict
        requested_pct = int(np.asarray(action).item()) * self.ALLOCATION_STEP

        # Clamp the request into the feasible range for this stock:
        # - at most the cap and at most what is still unallocated
        # - at least what the remaining stocks could not absorb at the cap
        stocks_after = self.n_stocks - self.current_stock_idx - 1
        upper = min(self.MAX_ALLOCATION, self.remaining_allocation)
        lower = max(0, self.remaining_allocation - self.MAX_ALLOCATION * stocks_after)
        # max() is applied last so the total still reaches 100% even if the
        # cap alone would make that impossible.
        allocation_pct = max(lower, min(requested_pct, upper))

        # Apply allocation for current stock
        self.allocation[self.current_stock_idx] = allocation_pct
        self.remaining_allocation -= allocation_pct
        self.current_stock_idx += 1

        reward = 0.0
        terminated = False
        truncated = False
        info = {}

        # If we've allocated to all stocks (completed a month)
        if self.current_stock_idx == self.n_stocks:
            portfolio_return, stock_returns = self._calculate_monthly_performance()

            # Allocation-weighted risk metrics at the start of the month
            sharpe = self._calculate_portfolio_metric('Sharpe_20d_scaled')
            max_drawdown = self._calculate_portfolio_metric('MaxDrawdown_252d_scaled')

            reward = float(self._calculate_reward(portfolio_return, sharpe, max_drawdown))

            # Track this month's return
            self.monthly_returns.append(portfolio_return)

            info = {
                'portfolio_return': portfolio_return,
                'sharpe': sharpe,
                'max_drawdown': max_drawdown,
                'allocation': self.allocation.copy(),
                'stock_returns': stock_returns
            }

            # Advance to next month
            self.current_step += self.ROWS_PER_MONTH
            self.current_month += 1

            # Check if episode is done
            terminated = (self.current_month >= self.episode_length)

            # Start the next allocation cycle; self.allocation is kept so
            # render() can still show the month that just ended.
            self.current_stock_idx = 0
            self.remaining_allocation = 100

        observation = self._get_observation()

        return observation, reward, terminated, truncated, info

    def _calculate_monthly_performance(self):
        """
        Calculate the portfolio return for the current month.

        Returns are measured from the current row to the row one month ahead
        (capped at the last available row). Returns that cannot be computed
        (zero or missing price) are treated as 0.

        Returns:
            Tuple of (portfolio_return, stock_returns)
        """
        next_step = min(self.current_step + self.ROWS_PER_MONTH, self._n_rows - 1)
        current_prices = self._close[self.current_step]
        next_prices = self._close[next_step]

        # Calculate individual stock returns
        with np.errstate(divide='ignore', invalid='ignore'):
            stock_returns = (next_prices - current_prices) / current_prices
        stock_returns = np.nan_to_num(stock_returns, nan=0.0, posinf=0.0, neginf=0.0)

        # Calculate weighted portfolio return
        portfolio_return = float(np.sum((self.allocation / 100) * stock_returns))

        return portfolio_return, stock_returns

    def _calculate_portfolio_metric(self, metric_name):
        """
        Calculate a weighted portfolio metric based on current allocations.

        Args:
            metric_name: Name of the metric to calculate (e.g., 'Sharpe_20d_scaled')

        Returns:
            Weighted average of the metric across all stocks, or 0.0 when any
            stock lacks the metric column. Missing values count as 0.
        """
        if metric_name not in self._metric_cache:
            frames = list(self.stocks.values())
            if all(metric_name in df.columns for df in frames):
                matrix = np.nan_to_num(
                    np.column_stack([self._numeric_column(df, metric_name) for df in frames]),
                    nan=0.0, posinf=0.0, neginf=0.0
                )
            else:
                matrix = None
            self._metric_cache[metric_name] = matrix

        matrix = self._metric_cache[metric_name]
        if matrix is None:
            return 0.0  # Default value if metric not available

        return float(np.sum((self.allocation / 100) * matrix[self.current_step]))

    def _calculate_reward(self, portfolio_return, sharpe, max_drawdown):
        """
        Calculate the reward based on portfolio performance.

        The reward function balances:
        1. Return above target (1% monthly)
        2. Risk-adjusted performance (Sharpe ratio)
        3. Downside protection (avoiding drawdowns)

        Args:
            portfolio_return: Monthly portfolio return
            sharpe: Sharpe ratio
            max_drawdown: Maximum drawdown

        Returns:
            Calculated reward value as a float
        """
        # Base reward centered around 1% monthly return target
        base_reward = (portfolio_return - 0.01) * 100

        # Extra penalty for negative returns (base_reward is negative here,
        # so scaling it up makes the penalty larger)
        if portfolio_return < 0:
            base_reward *= 1.5

        # Adjust for risk metrics
        risk_adjustment = sharpe * 0.5
        drawdown_penalty = max_drawdown * -2.0

        return float(base_reward + risk_adjustment + drawdown_penalty)

    def render(self, mode='human'):
        """Print the month, allocation and last return once a month's allocation is complete."""
        if self.current_stock_idx == 0:  # Just completed a month's allocation
            print(f"Month {self.current_month}")
            print(f"Allocation: {self.allocation}")
            if self.monthly_returns:
                print(f"Last month return: {self.monthly_returns[-1]:.4f}")

    def close(self):
        """Clean up resources (nothing to release)."""
        pass


def train_and_evaluate(data_dir, save_dir='./models', total_timesteps=100000, eval_episodes=10):
    """
    Train and evaluate a PPO agent on the portfolio allocation problem.

    This function:
    1. Creates the portfolio environment
    2. Validates that the environment works properly
    3. Trains a PPO agent with the specified hyperparameters
    4. Evaluates the trained agent
    5. Plots the performance and allocation results

    Args:
        data_dir: Directory containing stock CSV files
        save_dir: Directory to save model checkpoints (created if missing)
        total_timesteps: Number of timesteps to train for
        eval_episodes: Number of episodes to use for evaluation

    Returns:
        Trained model and evaluation results
    """
    # Make sure the checkpoint directory exists even when this function is
    # called directly rather than through main().
    os.makedirs(save_dir, exist_ok=True)

    # Create and validate environment
    print("Creating environment...")
    env = PortfolioEnv(data_dir)

    # Basic environment validation
    check_env(env)

    # Initialize agent
    print("Initializing PPO agent...")
    model = PPO(
        "MlpPolicy",
        env,
        verbose=1,
        learning_rate=3e-4,
        gamma=0.99,  # Discount factor
        n_steps=2048,  # Steps to collect before updating
        ent_coef=0.01,  # Entropy coefficient (exploration)
        vf_coef=0.5,  # Value function coefficient
        max_grad_norm=0.5,  # Gradient clipping
        # Separate policy and value networks. The older list form with shared
        # layers followed by a dict is no longer accepted by recent
        # Stable-Baselines3 releases, so the full stack is given to each head.
        policy_kwargs={'net_arch': dict(pi=[256, 128, 64], vf=[256, 128, 64])}
    )

    # Create checkpoint callback for saving models
    checkpoint_callback = CheckpointCallback(
        save_freq=10000,  # Save every 10000 steps
        save_path=save_dir,
        name_prefix="ppo_portfolio",
        save_replay_buffer=False,
        save_vecnormalize=True,
    )

    # Train the agent
    print(f"Training for {total_timesteps} timesteps...")
    model.learn(
        total_timesteps=total_timesteps,
        callback=checkpoint_callback,
        progress_bar=True
    )

    # Save the final model
    final_model_path = os.path.join(save_dir, "ppo_portfolio_final")
    model.save(final_model_path)
    print(f"Final model saved to {final_model_path}")

    # Evaluate the trained agent
    print(f"Evaluating agent over {eval_episodes} episodes...")
    eval_env = PortfolioEnv(data_dir)  # Create a separate env for evaluation

    # Basic evaluation for average reward
    mean_reward, std_reward = evaluate_policy(
        model,
        eval_env,
        n_eval_episodes=eval_episodes,
        deterministic=True
    )

    print(f"Mean reward: {mean_reward:.4f} ± {std_reward:.4f}")

    # Detailed evaluation for visualization
    results = detailed_evaluation(model, eval_env, n_episodes=eval_episodes)

    # Print summary statistics
    print("\nPerformance Summary:")
    print(f"Average Monthly Return: {results['mean_return']:.4f}")
    print(f"Average Sharpe Ratio: {results['mean_sharpe']:.4f}")
    print(f"Average Max Drawdown: {results['mean_drawdown']:.4f}")

    return model, results


def detailed_evaluation(model, env, n_episodes=10):
    """
    Perform a detailed evaluation of the trained model.

    This function:
    1. Runs the model for multiple episodes
    2. Collects detailed performance metrics
    3. Generates visualizations of allocations and returns

    Args:
        model: Trained PPO model
        env: Portfolio environment
        n_episodes: Number of episodes to evaluate (must be at least 1)

    Returns:
        Dictionary containing evaluation results with the keys
        'mean_return', 'mean_sharpe', 'mean_drawdown' and 'avg_allocation'

    Raises:
        ValueError: If n_episodes is below 1, since no statistics could be computed.
    """
    if n_episodes < 1:
        raise ValueError(f"n_episodes must be at least 1, got {n_episodes}")

    # Per-episode averages and the allocation of every completed month
    all_returns = []
    all_sharpes = []
    all_drawdowns = []
    monthly_allocations = []

    for episode in range(n_episodes):
        obs, info = env.reset()
        episode_allocations = []
        episode_returns = []
        episode_sharpes = []
        episode_drawdowns = []
        done = False

        while not done:
            # Get action from model
            action, _ = model.predict(obs, deterministic=True)

            # Take step in environment
            obs, reward, terminated, truncated, info = env.step(action)

            done = terminated or truncated

            # The environment only fills info when a month was just completed
            if 'portfolio_return' in info:
                episode_allocations.append(info['allocation'])
                episode_returns.append(info['portfolio_return'])
                episode_sharpes.append(info['sharpe'])
                episode_drawdowns.append(info['max_drawdown'])

        # Save episode results
        episode_return = np.mean(episode_returns)
        episode_sharpe = np.mean(episode_sharpes)
        monthly_allocations.extend(episode_allocations)
        all_returns.append(episode_return)
        all_sharpes.append(episode_sharpe)
        all_drawdowns.append(np.mean(episode_drawdowns))

        print(f"Episode {episode+1}: Return = {episode_return:.4f}, Sharpe = {episode_sharpe:.4f}")

    # Calculate average allocation across all months
    avg_allocation = np.mean(monthly_allocations, axis=0)

    # Create visualizations
    create_visualizations(avg_allocation, all_returns, all_sharpes, all_drawdowns)

    # Return summary results
    return {
        'mean_return': np.mean(all_returns),
        'mean_sharpe': np.mean(all_sharpes),
        'mean_drawdown': np.mean(all_drawdowns),
        'avg_allocation': avg_allocation
    }


def create_visualizations(avg_allocation, returns, sharpes, drawdowns):
    """
    Create and save visualizations of the evaluation results.

    Three PNG files are written to the 'results' directory (created if
    missing). Each figure is closed after saving so repeated calls do not
    accumulate open figures.

    Args:
        avg_allocation: Average allocation percentages across all episodes
        returns: List of average returns for each episode
        sharpes: List of average Sharpe ratios for each episode
        drawdowns: List of average maximum drawdowns for each episode
    """
    # Create output directory if it doesn't exist
    os.makedirs('results', exist_ok=True)

    mean_return = np.mean(returns)

    # Plot 1: Average allocation
    fig = plt.figure(figsize=(12, 6))
    bars = plt.bar(range(len(avg_allocation)), avg_allocation)

    # Add value labels on top of each bar
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 1,
                 f'{height:.1f}%',
                 ha='center', va='bottom', rotation=0)

    plt.xlabel('Stock')
    plt.ylabel('Average Allocation (%)')
    plt.title('Average Portfolio Allocation')
    plt.xticks(range(len(avg_allocation)), [f'Stock {i}' for i in range(len(avg_allocation))])
    # Give some headroom for labels; skipped when every bar is zero because
    # identical lower and upper limits are not a usable axis range.
    top = max(avg_allocation) * 1.2
    if top > 0:
        plt.ylim(0, top)
    plt.savefig('results/portfolio_allocation.png')
    plt.close(fig)

    # Plot 2: Returns distribution
    fig = plt.figure(figsize=(10, 6))
    plt.hist(returns, bins=10, alpha=0.7)
    plt.axvline(mean_return, color='r', linestyle='dashed', linewidth=2)
    plt.text(mean_return*1.1, plt.ylim()[1]*0.9, f'Mean: {mean_return:.4f}')
    plt.xlabel('Monthly Return')
    plt.ylabel('Frequency')
    plt.title('Distribution of Monthly Returns')
    plt.savefig('results/returns_distribution.png')
    plt.close(fig)

    # Plot 3: Performance metrics comparison
    fig = plt.figure(figsize=(12, 6))
    metrics = ['Return', 'Sharpe', 'Drawdown']
    values = [mean_return, np.mean(sharpes), np.mean(drawdowns)]
    colors = ['green', 'blue', 'red']

    bars = plt.bar(metrics, values, color=colors)
    plt.title('Average Performance Metrics')

    # Add value labels
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.001,
                 f'{height:.4f}',
                 ha='center', va='bottom')

    plt.savefig('results/performance_metrics.png')
    plt.close(fig)

    print("Visualizations saved to 'results' directory")


def main():
    """
    Main entry point for the program.

    This function:
    1. Asks for the path to the stock data directory
    2. Trains and evaluates the model

    Prints an error and returns without training when the given path is not
    an existing directory.
    """
    # Get directory containing stock CSV files; surrounding whitespace from
    # typing or pasting the path is dropped.
    data_dir = input("Enter path to directory containing stock CSV files: ").strip()

    # isdir rather than exists: a path to a regular file would pass an
    # existence check and then fail later when the directory is listed.
    if not os.path.isdir(data_dir):
        print(f"Error: Directory {data_dir} does not exist")
        return

    # Create directory for model checkpoints
    save_dir = './models'
    os.makedirs(save_dir, exist_ok=True)

    # Train and evaluate model
    train_and_evaluate(
        data_dir=data_dir,
        save_dir=save_dir,
        total_timesteps=100000,  # Adjust based on available computational resources
        eval_episodes=10
    )

    print("Training and evaluation complete!")


# Run the interactive train-and-evaluate workflow only when this file is
# executed as a script, not when it is imported as a module.
if __name__ == "__main__":
    main()