In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [24]:
def _parse_date_column(raw_dates, date_column_name):
    """Convert a raw column to datetimes, retrying day-first when most values fail to parse."""
    parsed = pd.to_datetime(raw_dates, errors='coerce')
    # When more than half of the non-null cells failed, retry with dayfirst=True
    # and keep that result only if it leaves fewer unparsed values.
    if parsed.isna().sum() > len(raw_dates.dropna()) * 0.5:
        parsed_dayfirst = pd.to_datetime(raw_dates, errors='coerce', dayfirst=True)
        if parsed_dayfirst.isna().sum() < parsed.isna().sum():
            parsed = parsed_dayfirst
            print(f"Used dayfirst=True for date conversion on column '{date_column_name}'.")
    return parsed


def _coerce_prices(raw_prices):
    """Return the price column as numbers; commas in text cells are stripped, failures become NaN."""
    is_text = pd.api.types.is_object_dtype(raw_prices) or pd.api.types.is_string_dtype(raw_prices)
    if is_text:
        raw_prices = raw_prices.astype(str).str.replace(',', '', regex=False)
    return pd.to_numeric(raw_prices, errors='coerce')


def _simulate_gbm_paths(start_price, mu, sigma, dt, num_scenarios, num_projection_days, rng):
    """Return a (num_scenarios, num_projection_days) array of simulated prices.

    Each step multiplies the previous price by exp(drift + shock); summing the
    log-steps and exponentiating once gives the same compounding as a
    step-by-step loop, up to floating-point rounding.
    """
    z = rng.normal(0, 1, size=(num_scenarios, num_projection_days))
    # NOTE(review): `mu` is the sample mean of log returns, which under GBM already
    # estimates (drift - sigma^2 / 2) * dt. The extra -0.5 * sigma**2 term is kept
    # from the original implementation; confirm that this is intended.
    drift = (mu - 0.5 * sigma**2) * dt
    log_steps = drift + sigma * np.sqrt(dt) * z
    return start_price * np.exp(np.cumsum(log_steps, axis=1))


def gbm_simulation_with_fixed_columns(file_path, date_column_name, price_column_name, num_scenarios=100, num_projection_days=30, random_seed=None):
    """Simulate future price paths with a geometric-Brownian-motion style model.

    The Excel file is loaded, rows are ordered by date, the price column is
    converted to numbers, and the mean / sample standard deviation of the
    log returns between consecutive rows are used as the per-step mu and sigma.
    Progress and the parameters used are printed; problems are reported with a
    printed message and a ``None`` return rather than an exception.

    Args:
        file_path: Path passed to ``pd.read_excel``.
        date_column_name: Column holding the observation dates.
        price_column_name: Column holding the prices (numbers, or text such as "1,234.5").
        num_scenarios: Number of simulated paths (rows of the result).
        num_projection_days: Number of simulated steps per path.
        random_seed: Optional seed. When given, a dedicated
            ``np.random.default_rng(random_seed)`` is used so repeated calls return
            identical paths. When ``None`` (default) draws come from NumPy's global
            generator, so a prior ``np.random.seed(...)`` still controls the result.

    Returns:
        A DataFrame with a 'Scenario No.' column followed by one
        'Day_<k>_pred_price' column per projected step, or ``None`` on failure.
    """
    try:
        data_df = pd.read_excel(file_path)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return None
    except Exception as e:
        # Other read failures, e.g. a missing Excel engine.
        if "Excel file format cannot be determined" in str(e) or "No engine for file type" in str(e):
            print(f"Error reading Excel file: {e}. You might need to install an Excel engine like 'openpyxl'. Try: pip install openpyxl")
        else:
            print(f"Error reading Excel file: {e}")
        return None

    if data_df.empty:
        print("Error: Excel file is empty or the sheet is empty.")
        return None

    original_row_count = len(data_df)
    print(f"Successfully loaded '{file_path}' with {original_row_count} rows.")

    # --- Date Column Processing ---
    if date_column_name not in data_df.columns:
        print(f"Error: Specified date column '{date_column_name}' not found in the Excel file.")
        print(f"Available columns: {data_df.columns.tolist()}")
        return None

    print(f"Using specified date column: '{date_column_name}'")
    try:
        data_df[date_column_name] = _parse_date_column(data_df[date_column_name], date_column_name)
        rows_before_dropna = len(data_df)
        data_df = data_df.dropna(subset=[date_column_name])
        rows_after_dropna = len(data_df)
        if rows_before_dropna > rows_after_dropna:
            print(f"Removed {rows_before_dropna - rows_after_dropna} rows due to failed date conversion in '{date_column_name}'.")

        if data_df.empty:
            print(f"Error: No valid dates found in column '{date_column_name}' after conversion.")
            return None

        # A stable sort keeps rows that share a date in file order, so the row
        # picked as the "last observed price" does not depend on the sort algorithm.
        data_df = data_df.sort_values(by=date_column_name, ascending=True, kind='mergesort')
        print(f"Data sorted by date column: '{date_column_name}' in ascending order.")
        print(f"Date range after sorting: {data_df[date_column_name].min()} to {data_df[date_column_name].max()}")
    except Exception as e:
        print(f"Error processing date column '{date_column_name}': {e}.")
        return None

    # --- Price Column Processing ---
    if price_column_name not in data_df.columns:
        print(f"Error: Specified price column '{price_column_name}' not found in the Excel file.")
        print(f"Available columns: {data_df.columns.tolist()}")
        return None

    print(f"Using specified price column: '{price_column_name}'")
    try:
        # Work on a separate Series instead of adding a helper column, so an input
        # column that happens to be called 'cleaned_price' is never overwritten.
        numeric_prices = _coerce_prices(data_df[price_column_name])
        has_price = numeric_prices.notna()
        rows_before_price_dropna = len(data_df)
        data_df = data_df.loc[has_price]
        prices = numeric_prices.loc[has_price]
        rows_after_price_dropna = len(data_df)
        if rows_before_price_dropna > rows_after_price_dropna:
            print(f"Removed {rows_before_price_dropna - rows_after_price_dropna} rows due to failed price conversion/cleaning in '{price_column_name}'.")
    except Exception as e:
        print(f"Error cleaning/converting price column '{price_column_name}': {e}")
        return None

    if len(prices) < 2:
        print(f"Error: Not enough valid numeric data in price column '{price_column_name}' after cleaning (need at least 2 data points). Found: {len(prices)}")
        return None

    # --- Log Returns Calculation ---
    price_values = prices.to_numpy(dtype=float)
    with np.errstate(divide='ignore', invalid='ignore'):
        raw_log_returns = np.log(price_values[1:] / price_values[:-1])
    # Zero or negative prices give NaN or +/-inf here; a plain dropna() would keep
    # the infinities and poison mu and sigma, so keep finite values only.
    is_finite = np.isfinite(raw_log_returns)
    skipped_returns = int((~is_finite).sum())
    if skipped_returns:
        print(f"Skipped {skipped_returns} non-finite log return(s) (zero or negative prices in '{price_column_name}').")
    log_returns = pd.Series(raw_log_returns[is_finite])

    # The sample standard deviation needs at least two observations.
    if len(log_returns) < 2:
        print(f"Error: Not enough valid log returns to calculate mu and sigma (found {len(log_returns)}, need at least 2).")
        return None

    # --- GBM Parameters ---
    mu = log_returns.mean()
    sigma = log_returns.std()

    if pd.isna(mu) or pd.isna(sigma):
        print(f"Error: Calculated mu ({mu}) or sigma ({sigma}) is NaN.")
        return None

    if sigma == 0:
        print("Warning: Calculated daily volatility (σ) is 0. Predictions will be deterministic based on drift.")

    s_t = prices.iloc[-1]
    if not s_t > 0:
        print(f"Error: Last observed price ({s_t}) is not positive; cannot use it as the simulation start.")
        return None
    dt = 1

    # --- Simulation ---
    # Non-positive counts yield an empty result, matching range() semantics.
    scenario_count = max(num_scenarios, 0)
    step_count = max(num_projection_days, 0)
    rng = np.random.default_rng(random_seed) if random_seed is not None else np.random
    simulated_paths = _simulate_gbm_paths(s_t, mu, sigma, dt, scenario_count, step_count, rng)

    day_columns = [f'Day_{d+1}_pred_price' for d in range(step_count)]
    results_df = pd.DataFrame(simulated_paths, columns=day_columns)
    results_df.insert(0, 'Scenario No.', range(1, scenario_count + 1))

    print("\n--- Simulation Parameters Used ---")
    print(f"Data source: '{file_path}' (initial rows: {original_row_count})")
    min_date = data_df[date_column_name].min()
    max_date = data_df[date_column_name].max()
    print(f"Data range used (after sorting by '{date_column_name}'): {min_date.strftime('%Y-%m-%d')} to {max_date.strftime('%Y-%m-%d')}")
    print(f"Number of price data points used (after cleaning/sorting): {len(prices)}")
    print(f"Last observed price (S_t from column '{price_column_name}'): {s_t:.4f}")
    print(f"Calculated daily drift (μ): {mu:.8f}")
    print(f"Calculated daily volatility (σ): {sigma:.8f}")
    print(f"Time step (Δt): {dt} day(s)")
    print(f"Number of scenarios: {num_scenarios}")
    print(f"Number of projection days per scenario: {num_projection_days}")
    if random_seed is not None:
        print(f"Random seed: {random_seed}")

    return results_df

# --- How to use the function ---
# Driver for the GBM simulation: configure the inputs, run the simulation once,
# then print a preview of the simulated paths and per-day summary statistics.
file_path_for_simulation = "SET 50 Index Spot.xlsx"

# --- User specified column names ---
date_col = 'Date'
price_col = 'SET50'

# --- Number of scenarios and projection days ---
scenarios_to_run = 100
days_to_project = 30

# --- Reproducibility ---
# The simulation draws from NumPy's global random generator by default; seeding it
# here makes the tables below identical on every Restart & Run All.
random_seed_for_simulation = 42
np.random.seed(random_seed_for_simulation)

print(f"Starting GBM Simulation Script with fixed column names: Date='{date_col}', Price='{price_col}'")
print(f"Input file: {file_path_for_simulation}")
print(f"Number of scenarios: {scenarios_to_run}, Projection days: {days_to_project}")
print(f"Random seed: {random_seed_for_simulation}")

simulation_results_data = gbm_simulation_with_fixed_columns(
    file_path_for_simulation,
    date_column_name=date_col,
    price_column_name=price_col,
    num_scenarios=scenarios_to_run,
    num_projection_days=days_to_project
)

if simulation_results_data is not None:
    print("\n--- Simulated Price Paths (First 10 scenarios, first 5 days) ---")
    # Showing every day column would be very wide, so preview only the first few days.
    columns_to_show = ['Scenario No.'] + [f'Day_{d+1}_pred_price' for d in range(min(5, days_to_project))]
    print(simulation_results_data[columns_to_show].head(10).to_string())

    print("\n--- Summary Statistics for Projected Prices ---")
    # Describe only the prediction columns (the scenario number is just a label).
    prediction_price_columns = [col for col in simulation_results_data.columns if 'pred_price' in col]
    if prediction_price_columns:
        print(simulation_results_data[prediction_price_columns].describe().to_string())
    else:
        print("No prediction columns found to describe.")

else:
    print("\nSimulation could not be completed. Please review the error messages above for details.")

print("\nGBM Simulation Script Finished.")

# # Save to Excel (optional; only valid when the simulation returned a result)
# if simulation_results_data is not None:
#     with pd.ExcelWriter('SET50_scenerio.xlsx', engine='openpyxl', mode='w') as writer:
#         simulation_results_data.to_excel(writer, sheet_name='Combined', index=False)

Starting GBM Simulation Script with fixed column names: Date='Date', Price='SET50'
Input file: SET 50 Index Spot.xlsx
Number of scenarios: 100, Projection days: 30
Successfully loaded 'SET 50 Index Spot.xlsx' with 980 rows.
Using specified date column: 'Date'
Data sorted by date column: 'Date' in ascending order.
Date range after sorting: 2021-01-04 00:00:04 to 2025-01-16 00:00:04
Using specified price column: 'SET50'

--- Simulation Parameters Used ---
Data source: 'SET 50 Index Spot.xlsx' (initial rows: 980)
Data range used (after sorting by 'Date'): 2021-01-04 to 2025-01-16
Number of price data points used (after cleaning/sorting): 980
Last observed price (S_t from column 'SET50'): 883.4000
Calculated daily drift (μ): -0.00004630
Calculated daily volatility (σ): 0.00782338
Time step (Δt): 1 day(s)
Number of scenarios: 100
Number of projection days per scenario: 30

--- Simulated Price Paths (First 10 scenarios, first 5 days) ---
   Scenario No.  Day_1_pred_price  Day_2_pred_price  D