# In [None]:
import numpy as np
import pandas as pd
import os
import gymnasium as gym
from gymnasium import spaces
from scipy.special import softmax
import matplotlib.pyplot as plt
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.evaluation import evaluate_policy
from typing import Dict, List, Tuple, Optional

class PortfolioEnv(gym.Env):
    """Gymnasium environment for monthly allocation across a basket of stocks.

    At every step the agent emits one weight per stock. The weights are
    converted into a discrete allocation (multiples of 10%, at most 30% per
    stock, summing to 100% whenever that is feasible) which is held for the
    next ``_ROWS_PER_STEP`` data rows. An episode lasts ``episode_length``
    steps.

    Observations are the ``features`` columns of every stock at the current
    row, concatenated stock by stock. A feature column that is missing from a
    stock's data (or is NaN) is reported as 0.0, and values are clipped to the
    bounds of ``observation_space``.

    NOTE(review): rows are addressed by position, not by date, so the
    per-stock CSV files are presumably already aligned on the same calendar;
    confirm against the data pipeline.
    """

    metadata = {'render_modes': ['human']}

    # Number of ``stock_<i>.csv`` files the environment expects.
    _N_STOCK_FILES = 10
    # Data rows advanced per step (one "month").
    _ROWS_PER_STEP = 30
    # Rows kept free at both ends of the data when picking an episode start.
    _EDGE_MARGIN = 20
    # Allocation grid: multiples of 10%, capped at 30% per stock.
    _ALLOCATION_STEP = 10.0
    _MAX_ALLOCATION = 30.0

    def __init__(self, data_dir: str, episode_length: int = 12, temperature: float = 0.3):
        """Load the stock data and build the observation/action spaces.

        Args:
            data_dir: Directory containing ``stock_0.csv`` ... ``stock_9.csv``.
            episode_length: Number of steps (months) per episode.
            temperature: Softmax temperature applied to the raw action;
                lower values concentrate the allocation.

        Raises:
            ValueError: If ``temperature`` is not a positive finite number or
                the data is too short for ``episode_length``.
            FileNotFoundError: If ``data_dir`` contains no CSV file at all.
        """
        super().__init__()

        # A zero/negative/NaN temperature would turn the softmax into NaNs.
        if not np.isfinite(temperature) or temperature <= 0:
            raise ValueError(
                f"temperature must be a positive finite number, got {temperature!r}"
            )

        self.stocks = self._load_stock_data(data_dir)
        self.n_stocks = len(self.stocks)
        self.episode_length = episode_length
        self.temperature = temperature

        self.features = [
            'Close_scaled', 'MA5_scaled', 'MA20_scaled', 'MA50_scaled', 'MA200_scaled',
            'RSI_scaled', 'BB_width_scaled', 'ATR_scaled', 'Return_1W_scaled',
            'Return_1M_scaled', 'Return_3M_scaled', 'CurrentDrawdown_scaled',
            'MaxDrawdown_252d_scaled', 'Sharpe_20d_scaled', 'Sharpe_60d_scaled'
        ]

        obs_dim = len(self.features) * self.n_stocks
        self.observation_space = spaces.Box(
            low=-10, high=10,
            shape=(obs_dim,),
            dtype=np.float32
        )

        self.action_space = spaces.Box(
            low=0, high=1,
            shape=(self.n_stocks,),
            dtype=np.float32
        )

        self.start_date, self.end_date = self._get_common_date_range()

        # Initialise the episode state so the env is usable right away.
        self.reset()

    def _load_stock_data(self, data_dir: str) -> Dict[str, pd.DataFrame]:
        """Read ``stock_<i>.csv`` for every expected stock.

        When a file is missing, the alphabetically first CSV in ``data_dir``
        is used in its place and a warning is printed. Frames that have a
        ``Date`` column are parsed and sorted chronologically.

        Returns:
            Mapping ``'stock_<i>'`` -> DataFrame, in index order.

        Raises:
            FileNotFoundError: If a file is missing and ``data_dir`` holds no
                CSV file to fall back to.
        """
        stocks = {}
        fallback_csvs = None  # directory listing, read lazily and only once

        for i in range(self._N_STOCK_FILES):
            file_path = os.path.join(data_dir, f"stock_{i}.csv")

            if not os.path.exists(file_path):
                if fallback_csvs is None:
                    # Sorted so the chosen fallback does not depend on the
                    # arbitrary order returned by os.listdir.
                    fallback_csvs = sorted(
                        f for f in os.listdir(data_dir) if f.endswith('.csv')
                    )
                if not fallback_csvs:
                    raise FileNotFoundError(f"No CSV files found in {data_dir}")

                file_path = os.path.join(data_dir, fallback_csvs[0])
                print(f"Warning: stock_{i}.csv not found, using {fallback_csvs[0]} instead")

            df = pd.read_csv(file_path)

            if 'Date' in df.columns:
                df['Date'] = pd.to_datetime(df['Date'])
                df = df.sort_values('Date')

            stocks[f'stock_{i}'] = df

        return stocks

    def _data_length(self) -> int:
        """Return the row count of the shortest stock frame.

        Rows are addressed positionally for every stock, so only indices
        valid in the shortest frame are safe to use.
        """
        return min(len(df) for df in self.stocks.values())

    def _get_common_date_range(self) -> Tuple[str, str]:
        """Return the (start, end) range shared by all stocks, as strings.

        With ``Date`` columns the range is the latest first date and the
        earliest last date, formatted ``YYYY-MM-DD``. Without them, row
        positions are used instead.

        Raises:
            ValueError: If the common range is shorter than
                ``(episode_length + 1) * 30`` days/rows, or if dated and
                undated frames are mixed.
        """
        start_dates = []
        end_dates = []

        for df in self.stocks.values():
            if 'Date' in df.columns:
                start_dates.append(df['Date'].min())
                end_dates.append(df['Date'].max())
            else:
                start_dates.append(0)
                end_dates.append(len(df) - 1)

        min_length = (self.episode_length + 1) * self._ROWS_PER_STEP
        is_dated = [isinstance(d, (pd.Timestamp, np.datetime64)) for d in start_dates]

        if all(is_dated):
            start_date = max(start_dates)
            end_date = min(end_dates)

            if (end_date - start_date).days < min_length:
                raise ValueError(f"Common date range too short for episode length {self.episode_length}")

            return start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')

        if any(is_dated):
            # Timestamps and row positions cannot be compared meaningfully.
            raise ValueError(
                "Stock files mix dated and undated (or empty) data; "
                "cannot compute a common range"
            )

        max_start = max(start_dates)
        min_end = min(end_dates)

        if min_end - max_start < min_length:
            raise ValueError(f"Common date range too short for episode length {self.episode_length}")

        return str(max_start), str(min_end)

    def reset(self, seed=None, options=None):
        """Start a new episode at a random row and return ``(observation, info)``.

        The start row is drawn from the environment's own seeded generator so
        that ``reset(seed=...)`` is reproducible.

        Raises:
            ValueError: If the data is too short to fit one full episode.
        """
        super().reset(seed=seed)

        data_length = self._data_length()
        max_start_idx = (
            data_length
            - self.episode_length * self._ROWS_PER_STEP
            - self._EDGE_MARGIN
        )
        if max_start_idx <= self._EDGE_MARGIN:
            raise ValueError(
                f"Not enough data rows ({data_length}) for episode length {self.episode_length}"
            )

        # High bound is exclusive, like the np.random.randint call it replaces.
        self.current_step = int(self.np_random.integers(self._EDGE_MARGIN, max_start_idx))
        self.current_month = 0

        self.monthly_returns = []
        self.portfolio_value = 100.0
        self.previous_allocation = np.zeros(self.n_stocks)

        observation = self._get_observation()
        info = {}

        return observation, info

    def _get_observation(self):
        """Build the flat float32 observation for the current row.

        Missing feature columns and NaN values become 0.0; everything is
        clipped into the bounds declared by ``observation_space``.
        """
        observation = []

        for stock_data in self.stocks.values():
            current_data = stock_data.iloc[self.current_step]
            observation.extend(
                current_data[feature] if feature in current_data else 0.0
                for feature in self.features
            )

        obs = np.nan_to_num(np.array(observation, dtype=np.float32), nan=0.0)
        return np.clip(
            obs, self.observation_space.low, self.observation_space.high
        ).astype(np.float32)

    def _convert_to_allocation(self, action_weights):
        """Convert raw action weights into a discrete percentage allocation.

        The weights go through a temperature-scaled softmax, are rounded to
        the nearest 10% and capped at 30%. Any remaining gap to 100% is closed
        in 10% units with a largest-remainder rule: when adding, the stock
        whose target share exceeds its current allocation the most is topped
        up first; when removing, the most over-allocated stock is cut first.

        Returns:
            Array of per-stock percentages (multiples of 10 in [0, 30]). It
            sums to 100 whenever the cap allows it (at least four stocks).

        Raises:
            ValueError: If the action contains NaN or infinite values.
        """
        weights = np.asarray(action_weights, dtype=np.float64)
        if not np.all(np.isfinite(weights)):
            raise ValueError("action contains non-finite values")

        percentages = softmax(weights / self.temperature) * 100.0

        step = self._ALLOCATION_STEP
        cap = self._MAX_ALLOCATION
        allocations = np.clip(np.round(percentages / step) * step, 0.0, cap)

        # Gap to 100% in whole grid units; must be an int to drive the loop.
        units = int(round((100.0 - float(np.sum(allocations))) / step))
        if units == 0:
            return allocations

        direction = 1.0 if units > 0 else -1.0
        # Positive residual: the stock holds less than its softmax share.
        residual = percentages - allocations

        for _ in range(abs(units)):
            adjustable = allocations < cap if direction > 0 else allocations > 0
            if not np.any(adjustable):
                break  # constraints make an exact 100% unreachable
            priority = np.where(adjustable, direction * residual, -np.inf)
            idx = int(np.argmax(priority))
            allocations[idx] += direction * step
            residual[idx] -= direction * step

        return allocations

    def step(self, action):
        """Apply ``action`` for one month and advance the environment.

        Returns:
            ``(observation, reward, terminated, truncated, info)``. ``info``
            carries the realised portfolio return and value, the
            allocation-weighted Sharpe and drawdown metrics, the allocation
            and the per-stock returns. ``truncated`` is always ``False``.
        """
        allocation = self._convert_to_allocation(action)

        self.previous_allocation = allocation.copy()

        portfolio_return, stock_returns = self._calculate_monthly_performance(allocation)

        self.portfolio_value *= (1 + portfolio_return)

        # Risk metrics are read at the decision row, before advancing.
        sharpe = self._calculate_portfolio_metric('Sharpe_20d_scaled', allocation)
        max_drawdown = self._calculate_portfolio_metric('MaxDrawdown_252d_scaled', allocation)

        reward = self._calculate_reward(portfolio_return, sharpe, max_drawdown)

        self.monthly_returns.append(portfolio_return)

        info = {
            'portfolio_return': portfolio_return,
            'portfolio_value': self.portfolio_value,
            'sharpe': sharpe,
            'max_drawdown': max_drawdown,
            'allocation': allocation.copy(),
            'stock_returns': stock_returns
        }

        self.current_step += self._ROWS_PER_STEP
        self.current_month += 1

        terminated = (self.current_month >= self.episode_length)
        truncated = False

        observation = self._get_observation()

        return observation, reward, terminated, truncated, info

    def _calculate_monthly_performance(self, allocation):
        """Return ``(portfolio_return, stock_returns)`` over the next month.

        Per-stock returns are the relative change of the ``Close`` column
        between the current row and the row one step ahead (clamped to the
        last available row). The portfolio return weights them by
        ``allocation`` (given in percent).
        """
        frames = list(self.stocks.values())
        next_step = min(self.current_step + self._ROWS_PER_STEP, self._data_length() - 1)

        current_prices = np.array(
            [df['Close'].iloc[self.current_step] for df in frames], dtype=np.float64
        )
        next_prices = np.array(
            [df['Close'].iloc[next_step] for df in frames], dtype=np.float64
        )

        stock_returns = (next_prices - current_prices) / current_prices

        portfolio_return = float(np.sum((np.asarray(allocation) / 100) * stock_returns))

        return portfolio_return, stock_returns

    def _calculate_portfolio_metric(self, metric_name, allocation):
        """Return the allocation-weighted value of ``metric_name`` at the current row.

        Returns 0.0 when any stock lacks the column; NaN entries count as 0,
        matching how missing data is treated elsewhere in the environment.
        """
        frames = list(self.stocks.values())
        if not all(metric_name in df.columns for df in frames):
            return 0.0

        metric_values = np.array(
            [df[metric_name].iloc[self.current_step] for df in frames], dtype=np.float64
        )
        metric_values = np.nan_to_num(metric_values, nan=0.0)
        return float(np.sum((np.asarray(allocation) / 100) * metric_values))

    def _calculate_reward(self, portfolio_return, sharpe, max_drawdown):
        """Combine excess return, Sharpe and drawdown into a scalar reward.

        The benchmark is the mean ``Return_1M_scaled`` across stocks at the
        current row, scaled by 0.01 and floored at zero. The reward is::

            (portfolio_return - benchmark) * 100 + sharpe - 1.5 * max_drawdown

        with the drawdown term multiplied by a further 1.5 when
        ``max_drawdown`` is below -0.1.
        """
        benchmark_values = np.array(
            [
                df['Return_1M_scaled'].iloc[self.current_step]
                if 'Return_1M_scaled' in df.columns else 0
                for df in self.stocks.values()
            ],
            dtype=np.float64,
        )
        # NaN entries count as 0 so one bad row cannot poison the reward.
        benchmark_returns = float(np.mean(np.nan_to_num(benchmark_values, nan=0.0)))

        # Excess return over the (scaled, non-negative) benchmark.
        excess_return = portfolio_return - max(0, benchmark_returns * 0.01)

        base_reward = excess_return * 100

        sharpe_component = sharpe * 1.0
        drawdown_component = max_drawdown * -1.5

        # Significant drawdowns get a heavier weight than small ones.
        if max_drawdown < -0.1:
            drawdown_component *= 1.5

        return float(base_reward + sharpe_component + drawdown_component)

    def render(self, mode='human'):
        """Print the current month, last allocation and latest results."""
        print(f"Month {self.current_month}")
        print(f"Allocation: {self.previous_allocation}")
        if self.monthly_returns:
            print(f"Last month return: {self.monthly_returns[-1]:.4f}")
            print(f"Portfolio value: {self.portfolio_value:.2f}")

    def close(self):
        """Nothing to release; present to satisfy the Env interface."""
        pass


def train_and_evaluate(data_dir, save_dir='./models', total_timesteps=100000, eval_episodes=10, temperature=0.3):
    """Train a PPO agent on ``PortfolioEnv`` and evaluate it.

    Args:
        data_dir: Directory with the stock CSV files for the environment.
        save_dir: Directory for checkpoints and the final model; created if
            it does not exist.
        total_timesteps: Number of environment steps to train for.
        eval_episodes: Number of episodes used for both evaluations.
        temperature: Softmax temperature passed to the environments.

    Returns:
        ``(model, results)`` where ``results`` is the summary dict returned
        by ``detailed_evaluation``.
    """
    # Direct callers may not have created the output directory yet.
    os.makedirs(save_dir, exist_ok=True)

    print("Creating environment...")
    env = PortfolioEnv(data_dir, temperature=temperature)

    check_env(env)

    print("Initializing PPO agent...")
    # Recent Stable-Baselines3 releases no longer accept shared layers
    # followed by a dict (``[256, 128, dict(...)]``); the actor and critic
    # each get their own 256-128-64 stack instead.
    network_layers = [256, 128, 64]
    model = PPO(
        "MlpPolicy",
        env,
        verbose=1,
        learning_rate=3e-4,
        gamma=0.99,
        n_steps=2048,
        ent_coef=0.01,
        vf_coef=0.5,
        max_grad_norm=0.5,
        policy_kwargs={'net_arch': dict(pi=list(network_layers), vf=list(network_layers))}
    )

    checkpoint_callback = CheckpointCallback(
        save_freq=10000,
        save_path=save_dir,
        name_prefix="ppo_portfolio",
        save_replay_buffer=False,
        save_vecnormalize=True,
    )

    print(f"Training for {total_timesteps} timesteps...")
    model.learn(
        total_timesteps=total_timesteps,
        callback=checkpoint_callback,
        progress_bar=True
    )

    final_model_path = os.path.join(save_dir, "ppo_portfolio_final")
    model.save(final_model_path)
    print(f"Final model saved to {final_model_path}")

    print(f"Evaluating agent over {eval_episodes} episodes...")
    # A fresh environment keeps evaluation independent of training state.
    eval_env = PortfolioEnv(data_dir, temperature=temperature)

    mean_reward, std_reward = evaluate_policy(
        model,
        eval_env,
        n_eval_episodes=eval_episodes,
        deterministic=True
    )

    print(f"Mean reward: {mean_reward:.4f} ± {std_reward:.4f}")

    results = detailed_evaluation(model, eval_env, n_episodes=eval_episodes)

    print("\nPerformance Summary:")
    print(f"Average Monthly Return: {results['mean_return']:.4f}")
    print(f"Average Sharpe Ratio: {results['mean_sharpe']:.4f}")
    print(f"Average Max Drawdown: {results['mean_drawdown']:.4f}")
    print(f"Final Average Portfolio Value: ${results['mean_final_value']:.2f}")

    return model, results


def detailed_evaluation(model, env, n_episodes=10):
    """Roll out ``model`` deterministically and summarise its performance.

    Runs ``n_episodes`` episodes on ``env``, collecting the per-step
    allocation, return, Sharpe and drawdown values reported in ``info``,
    prints a line per episode, and hands the aggregates to
    ``create_visualizations``.

    Returns:
        Dict with ``mean_return``, ``mean_sharpe``, ``mean_drawdown``,
        ``mean_final_value`` and ``avg_allocation`` (mean allocation over
        every step of every episode).
    """
    monthly_allocations = []
    final_values = []
    all_returns, all_sharpes, all_drawdowns = [], [], []

    for episode in range(n_episodes):
        obs, info = env.reset()
        episode_returns = []
        episode_sharpes = []
        episode_drawdowns = []

        while True:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = env.step(action)

            monthly_allocations.append(info['allocation'])
            episode_returns.append(info['portfolio_return'])
            episode_sharpes.append(info['sharpe'])
            episode_drawdowns.append(info['max_drawdown'])

            if terminated or truncated:
                # The last step's info holds the end-of-episode value.
                final_values.append(info['portfolio_value'])
                break

        all_returns.append(np.mean(episode_returns))
        all_sharpes.append(np.mean(episode_sharpes))
        all_drawdowns.append(np.mean(episode_drawdowns))

        print(f"Episode {episode+1}: Return = {np.mean(episode_returns):.4f}, Final Value = ${final_values[-1]:.2f}")

    avg_allocation = np.mean(monthly_allocations, axis=0)

    create_visualizations(
        avg_allocation,
        all_returns,
        all_sharpes,
        all_drawdowns,
        final_values
    )

    summary = {
        'mean_return': np.mean(all_returns),
        'mean_sharpe': np.mean(all_sharpes),
        'mean_drawdown': np.mean(all_drawdowns),
        'mean_final_value': np.mean(final_values),
        'avg_allocation': avg_allocation
    }
    return summary


def create_visualizations(avg_allocation, returns, sharpes, drawdowns, final_values, output_dir='results'):
    """Save four summary charts as PNG files in ``output_dir``.

    Files written: ``portfolio_allocation.png``, ``returns_distribution.png``,
    ``portfolio_values.png`` and ``performance_metrics.png``.

    Args:
        avg_allocation: Average percentage allocation per stock.
        returns: Average monthly return of each episode.
        sharpes: Average Sharpe metric of each episode.
        drawdowns: Average drawdown metric of each episode.
        final_values: Final portfolio value of each episode.
        output_dir: Destination directory; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    figures = []
    try:
        # Average allocation per stock.
        fig, ax = plt.subplots(figsize=(12, 6))
        figures.append(fig)
        positions = range(len(avg_allocation))
        for bar in ax.bar(positions, avg_allocation):
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height + 1,
                    f'{height:.1f}%',
                    ha='center', va='bottom', rotation=0)
        ax.set_xlabel('Stock')
        ax.set_ylabel('Average Allocation (%)')
        ax.set_title('Average Portfolio Allocation')
        ax.set_xticks(list(positions))
        ax.set_xticklabels([f'Stock {i}' for i in positions])
        peak = float(np.max(avg_allocation))
        if peak > 0:
            # Identical lower/upper limits would make matplotlib warn.
            ax.set_ylim(0, peak * 1.2)
        fig.savefig(os.path.join(output_dir, 'portfolio_allocation.png'))

        # Distribution of per-episode average monthly returns.
        fig, ax = plt.subplots(figsize=(10, 6))
        figures.append(fig)
        mean_return = np.mean(returns)
        ax.hist(returns, bins=10, alpha=0.7)
        ax.axvline(mean_return, color='r', linestyle='dashed', linewidth=2)
        ax.text(mean_return*1.1, ax.get_ylim()[1]*0.9, f'Mean: {mean_return:.4f}')
        ax.set_xlabel('Average Monthly Return')
        ax.set_ylabel('Frequency')
        ax.set_title('Distribution of Average Monthly Returns')
        fig.savefig(os.path.join(output_dir, 'returns_distribution.png'))

        # Distribution of final portfolio values.
        fig, ax = plt.subplots(figsize=(10, 6))
        figures.append(fig)
        mean_value = np.mean(final_values)
        ax.hist(final_values, bins=10, alpha=0.7)
        ax.axvline(mean_value, color='r', linestyle='dashed', linewidth=2)
        ax.text(mean_value*1.02, ax.get_ylim()[1]*0.9, f'Mean: ${mean_value:.2f}')
        ax.set_xlabel('Final Portfolio Value ($)')
        ax.set_ylabel('Frequency')
        ax.set_title('Distribution of Final Portfolio Values (12-month episodes)')
        fig.savefig(os.path.join(output_dir, 'portfolio_values.png'))

        # Headline metrics side by side.
        fig, ax = plt.subplots(figsize=(12, 6))
        figures.append(fig)
        metrics = ['Return (%)', 'Sharpe', 'Drawdown (%)']
        values = [np.mean(returns)*100, np.mean(sharpes), np.mean(drawdowns)*100]
        colors = ['green', 'blue', 'red']
        ax.set_title('Average Performance Metrics')
        for bar in ax.bar(metrics, values, color=colors):
            height = bar.get_height()
            sign = "+" if height > 0 else ""
            # Labels sit above positive bars and below the others.
            ax.text(bar.get_x() + bar.get_width()/2., height + 0.1 if height > 0 else height - 0.6,
                    f'{sign}{height:.2f}',
                    ha='center', va='bottom' if height > 0 else 'top')
        fig.savefig(os.path.join(output_dir, 'performance_metrics.png'))
    finally:
        # Only close the figures created here; pyplot otherwise keeps them
        # alive and memory grows with every call.
        for fig in figures:
            plt.close(fig)

    print(f"Visualizations saved to '{output_dir}' directory")


def main():
    """Prompt for the data directory and temperature, then train and evaluate.

    Prints an error and returns early when the directory is missing or the
    temperature is not a positive number. An empty temperature answer falls
    back to 0.3.
    """
    data_dir = input("Enter path to directory containing stock CSV files: ").strip()

    # isdir rather than exists: a regular file would fail later in listdir.
    if not os.path.isdir(data_dir):
        print(f"Error: Directory {data_dir} does not exist")
        return

    save_dir = './models'
    os.makedirs(save_dir, exist_ok=True)

    raw_temperature = input("Enter softmax temperature (0.1-1.0, lower = more concentrated, higher = more diverse): ").strip() or "0.3"
    try:
        temperature = float(raw_temperature)
    except ValueError:
        print(f"Error: Invalid temperature '{raw_temperature}'")
        return

    # The temperature is a divisor in the softmax, so it must be positive.
    if not np.isfinite(temperature) or temperature <= 0:
        print("Error: Temperature must be a positive number")
        return

    train_and_evaluate(
        data_dir=data_dir,
        save_dir=save_dir,
        total_timesteps=100000,
        eval_episodes=10,
        temperature=temperature
    )

    print("Training and evaluation complete!")


# Run the interactive training workflow only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()