In [None]:
import pandas as pd
from prophet import Prophet
import plotly.graph_objects as go
import datetime
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from IPython.display import display # Optional: for better dataframe rendering in notebooks

# --- Configuration ---
DATA_PATH = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_21_24.csv"  # Make sure this file is in the same directory or provide the full path
TARGET_COLUMNS = ['avg_min_price', 'avg_max_price', 'avg_modal_price']
DATE_COLUMN = 'date'
MIN_DATA_POINTS = 30 # Minimum data points required to train a model

# --- User Selections (Replace Streamlit Sidebar Inputs) ---
# Set these values manually for your desired forecast
SELECTED_STATE = "Maharashtra"  # Example: Choose a state from your data
SELECTED_DISTRICT = "Akola"     # Example: Choose a district
SELECTED_COMMODITY = "Wheat"    # Example: Choose a commodity
FORECAST_DAYS = 90             # Example: Forecast period

# --- Data Loading Function ---
def load_data(path):
    """Loads and preprocesses the data."""
    try:
        df = pd.read_csv(path)
        print(f"Successfully loaded data from {path}")

        # Basic Preprocessing
        df[DATE_COLUMN] = pd.to_datetime(df[DATE_COLUMN], errors='coerce')
        initial_rows = len(df)
        df.dropna(subset=[DATE_COLUMN], inplace=True) # Drop rows where date conversion failed
        if initial_rows > len(df):
             print(f"Dropped {initial_rows - len(df)} rows due to invalid dates.")

        # Ensure price columns are numeric, coerce errors to NaN
        for col in TARGET_COLUMNS:
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # Optional: Impute missing prices if needed (example using ffill per group)
        # print("Attempting forward fill for missing prices within groups...")
        # df.sort_values([DATE_COLUMN,'state_name', 'district_name', 'commodity_name'], inplace=True)
        # df[TARGET_COLUMNS] = df.groupby(['state_name', 'district_name', 'commodity_name'])[TARGET_COLUMNS].ffill()

        initial_rows = len(df)
        df.dropna(subset=TARGET_COLUMNS, inplace=True) # Drop rows with missing target values
        if initial_rows > len(df):
             print(f"Dropped {initial_rows - len(df)} rows due to missing price data after preprocessing.")

        df.sort_values(DATE_COLUMN, inplace=True)
        print(f"Data preprocessing complete. {len(df)} rows remaining.")
        return df
    except FileNotFoundError:
        print(f"Error: Data file not found at {path}")
        return None
    except Exception as e:
        print(f"Error loading or preprocessing data: {e}")
        return None

# --- Modeling Function ---
def train_and_forecast(data, target_column, forecast_periods):
    """Trains a Prophet model and returns the model, forecast (starting from today),
       and predictions on the historical data for evaluation."""
    # Prepare data for Prophet (requires 'ds' and 'y' columns)
    prophet_df = data[[DATE_COLUMN, target_column]].rename(columns={DATE_COLUMN: 'ds', target_column: 'y'})

    # Check for sufficient data points for training
    if len(prophet_df) < MIN_DATA_POINTS:
        print(f"Warning: Not enough historical data points ({len(prophet_df)}) for '{target_column}' in the selected group to train. Need at least {MIN_DATA_POINTS}. Skipping forecast.")
        return None, None, None # Return None for model, forecast, and historical predictions

    try:
        print(f"\nTraining Prophet model for '{target_column}'...")
        # Instantiate and fit Prophet model on historical data
        model = Prophet(
            yearly_seasonality=True,
            weekly_seasonality=False, # Adjust based on expected patterns
            daily_seasonality=False   # Adjust based on expected patterns
        )
        model.fit(prophet_df) # Fit the model using historical data
        print("Model training complete.")

        # --- Create future dataframe STARTING FROM TODAY ---
        # Get today's date (midnight) based on system time when script runs
        current_date = pd.Timestamp.now().normalize()
        # Create a sequence of dates starting from today for the forecast period
        future_dates = pd.date_range(start=current_date, periods=forecast_periods, freq='D')
        future_df = pd.DataFrame({'ds': future_dates})
        # ----------------------------------------------------

        # Generate forecast using the dates starting from today
        print(f"Generating {forecast_periods}-day forecast starting from {current_date.strftime('%Y-%m-%d')}...")
        forecast = model.predict(future_df)
        print("Forecast generation complete.")

        # Generate predictions on historical data for evaluation
        historical_preds = model.predict(prophet_df) # Predict on the same data used for training

        return model, forecast, historical_preds # Return model, future forecast, historical predictions

    except Exception as e:
        print(f"Error during Prophet modeling or forecasting for {target_column}: {e}")
        return None, None, None

# --- Plotting Function for a Single Target ---
def plot_single_forecast(historical_data, forecast_data, target_column, title):
    """Creates a Plotly figure for one target's historical data and forecast."""
    fig = go.Figure()
    target_label = target_column.replace("avg_", "").replace("_price", "").capitalize() # Clean label for display

    # Add historical data trace
    hist_data_col = historical_data[[DATE_COLUMN, target_column]].dropna()
    fig.add_trace(go.Scatter(
        x=hist_data_col[DATE_COLUMN],
        y=hist_data_col[target_column],
        mode='lines',
        name=f'Historical {target_label}',
        line=dict(color='blue')
    ))

    # Add forecast trace (starts from today)
    fig.add_trace(go.Scatter(
        x=forecast_data['ds'],
        y=forecast_data['yhat'],
        mode='lines',
        name=f'Forecast {target_label}',
        line=dict(color='red', dash='dash')
    ))

    # Add uncertainty interval for the forecast
    fig.add_trace(go.Scatter(
        x=forecast_data['ds'],
        y=forecast_data['yhat_upper'],
        mode='lines', name='Forecast Upper Bound',
        line=dict(width=0),
        showlegend=False
    ))
    fig.add_trace(go.Scatter(
        x=forecast_data['ds'],
        y=forecast_data['yhat_lower'],
        mode='lines', name='Forecast Lower Bound',
        line=dict(width=0),
        fillcolor='rgba(255, 0, 0, 0.2)', # Light red fill for uncertainty
        fill='tonexty',
        showlegend=False
    ))

    fig.update_layout(
        title=title,
        xaxis_title='Date',
        yaxis_title=f'Price ({target_label})',
        hovermode="x unified",
        legend_title_text='Legend'
    )
    return fig

# --- Evaluation Metrics Function ---
def calculate_metrics(y_true, y_pred):
    """Calculates and returns R2, MAE, and MSE."""
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    return r2, mae, mse

# --- Main Execution Block ---
print("--- Crop Price Time Series Forecasting ---")
print(f"Forecasting from current date: {pd.Timestamp.now().normalize().strftime('%Y-%m-%d')}")

# Load data
df_full = load_data(DATA_PATH)

if df_full is not None:
    # --- Filtering Data Based on User Selections ---
    print(f"\nFiltering data for State='{SELECTED_STATE}', District='{SELECTED_DISTRICT}', Commodity='{SELECTED_COMMODITY}'...")
    filtered_df = df_full[
        (df_full['state_name'].str.strip().str.lower() == SELECTED_STATE.strip().lower()) &
        (df_full['district_name'].str.strip().str.lower() == SELECTED_DISTRICT.strip().lower()) &
        (df_full['commodity_name'].str.strip().str.lower() == SELECTED_COMMODITY.strip().lower())
    ].copy() # Use copy to avoid SettingWithCopyWarning

    # Ensure data is sorted by date (important for plotting historical correctly)
    filtered_df.sort_values(by=DATE_COLUMN, inplace=True)

    if filtered_df.empty:
        print("\nWarning: No historical data found for the selected combination.")
        print("Please check the CSV file and your selections (State, District, Commodity).")
    else:
        last_hist_date = filtered_df[DATE_COLUMN].max().strftime('%Y-%m-%d')
        print(f"\nFound {len(filtered_df)} historical data points (Latest: {last_hist_date}).")
        print(f"Proceeding with forecast for {FORECAST_DAYS} days.")

        all_forecasts = {} # Dictionary to store forecasts if needed later

        # Loop through each target price type
        for target in TARGET_COLUMNS:
            print("-" * 50)
            print(f"Processing Target: {target}")

            # Check if target column exists and has data after filtering
            if target not in filtered_df.columns or filtered_df[target].isnull().all():
                print(f"Warning: Target column '{target}' not found or contains only null values for the selection. Skipping.")
                continue

            # Prepare data for this specific target (dropping NaNs for this target)
            target_df = filtered_df[[DATE_COLUMN, target]].dropna().copy()
            if target_df.empty:
                 print(f"Warning: No valid data points for '{target}' after dropping NaNs. Skipping.")
                 continue

            # Train model, get forecast (future dates), and get historical predictions
            model, forecast, historical_preds = train_and_forecast(target_df, target, FORECAST_DAYS)

            if forecast is not None and historical_preds is not None:
                all_forecasts[target] = forecast # Store the forecast

                # --- Evaluate Model Fit on Historical Data ---
                print(f"\n--- Evaluating Model Fit for {target} (on historical data) ---")
                actuals = target_df[target] # Ground truth from the training data
                preds = historical_preds['yhat'] # Predictions on the training data
                
                # Ensure alignment - Prophet predictions match the input df length/order
                if len(actuals) == len(preds):
                    r2, mae, mse = calculate_metrics(actuals, preds)
                    print(f"R-squared (R2): {r2:.4f}")
                    print(f"Mean Absolute Error (MAE): {mae:.2f}")
                    print(f"Mean Squared Error (MSE): {mse:.2f}")
                else:
                    print("Warning: Mismatch between actuals and predictions length. Cannot calculate metrics accurately.")
                    print(f"Actuals length: {len(actuals)}, Predictions length: {len(preds)}")


                # --- Plot Historical Data and Forecast ---
                print(f"\n--- Plotting Historical Data & Forecast for {target} ---")
                plot_title = f'{target.replace("avg_", "").replace("_price", "").capitalize()} Price: Historical & {FORECAST_DAYS}-Day Forecast\n({SELECTED_COMMODITY} in {SELECTED_DISTRICT}, {SELECTED_STATE})'
                fig = plot_single_forecast(target_df, forecast, target, plot_title)
                fig.show() # Display the plot in the notebook output

                # --- Display Forecast Data Table ---
                print(f"\n--- Forecast Data Table for {target} ({FORECAST_DAYS} days) ---")
                f_display = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].copy()
                f_display.columns = ['Date', 'Forecast', 'Lower Bound', 'Upper Bound']
                f_display['Date'] = f_display['Date'].dt.strftime('%Y-%m-%d') # Format date
                # Use display for potentially nicer formatting in Jupyter, or just print
                display(f_display.set_index('Date').style.format("{:.2f}"))
                # Alternatively: print(f_display.set_index('Date').round(2).to_string())

            else:
                # Message already printed in train_and_forecast if skipped
                print(f"Skipping results display for {target} due to insufficient data or error during modeling.")

        print("-" * 50)
        print("\nForecasting process finished.")

else:
    print("\nFailed to load data. Cannot run the forecasting process.")

In [None]:
import pandas as pd
from prophet import Prophet
import plotly.graph_objects as go
import datetime
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from IPython.display import display # Optional: for better dataframe rendering in notebooks

# --- Configuration ---
DATA_PATH = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_21_24.csv" # Use raw string for Windows paths
TARGET_COLUMNS = ['avg_min_price', 'avg_max_price', 'avg_modal_price']
DATE_COLUMN = 'date'
MIN_DATA_POINTS = 30 # Minimum data points required to train a model

# --- User Selections (Strings) ---
SELECTED_STATE_STR = "Maharashtra"
SELECTED_DISTRICT_STR = "Akola"
SELECTED_COMMODITY_STR = "Wheat"
FORECAST_DAYS = 90

# --- Frequency Encoding Mappings (CRITICAL: REPLACE WITH YOUR ACTUAL MAPPINGS) ---
# These dictionaries MUST map the original string names to the EXACT numerical
# values present in your 'edited_21_24.csv' file for these columns.
# Example placeholder - replace with your real mappings:
state_name_encoding_map = {
    "maharashtra": 1500, # Replace 1500 with the actual encoded value for Maharashtra
    "gujarat": 1200,
    # ... add all other states and their encoded values ...
}

district_name_encoding_map = {
    "akola": 50,    # Replace 50 with the actual encoded value for Akola
    "pune": 80,
    # ... add all other districts and their encoded values ...
}

commodity_name_encoding_map = {
    "wheat": 250,   # Replace 250 with the actual encoded value for Wheat
    "rice": 300,
    # ... add all other commodities and their encoded values ...
}
# --- End Mappings ---


# --- Data Loading Function (No changes needed here) ---
def load_data(path):
    """Loads and preprocesses the data."""
    try:
        df = pd.read_csv(path)
        print(f"Successfully loaded data from {path}")

        # Basic Preprocessing
        df[DATE_COLUMN] = pd.to_datetime(df[DATE_COLUMN], errors='coerce')
        initial_rows = len(df)
        df.dropna(subset=[DATE_COLUMN], inplace=True)
        if initial_rows > len(df):
             print(f"Dropped {initial_rows - len(df)} rows due to invalid dates.")

        # Ensure price columns are numeric, coerce errors to NaN
        for col in TARGET_COLUMNS:
            # Check if column exists before trying to convert
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            else:
                print(f"Warning: Target column '{col}' not found in the loaded CSV.")


        # --- Important: Assume state/district/commodity columns are already numeric ---
        # We no longer need to convert them, but we should check they exist
        filter_cols = ['state_name', 'district_name', 'commodity_name']
        for col in filter_cols:
            if col not in df.columns:
                 print(f"Error: Filtering column '{col}' not found in the loaded CSV. Cannot proceed.")
                 return None
             # Optional: Check if they are indeed numeric
             # if not pd.api.types.is_numeric_dtype(df[col]):
             #      print(f"Warning: Column '{col}' is expected to be numeric due to encoding, but it's not.")


        initial_rows = len(df)
        # Drop rows where ANY target column is missing
        df.dropna(subset=TARGET_COLUMNS, inplace=True, how='any')
        if initial_rows > len(df):
             print(f"Dropped {initial_rows - len(df)} rows due to missing price data in target columns.")

        df.sort_values(DATE_COLUMN, inplace=True)
        print(f"Data preprocessing complete. {len(df)} rows remaining.")
        return df
    except FileNotFoundError:
        print(f"Error: Data file not found at {path}")
        return None
    except Exception as e:
        print(f"Error loading or preprocessing data: {e}")
        return None

# --- Modeling Function (No changes needed here) ---
def train_and_forecast(data, target_column, forecast_periods):
    """Trains a Prophet model and returns the model, forecast (starting from today),
       and predictions on the historical data for evaluation."""
    # Prepare data for Prophet (requires 'ds' and 'y' columns)
    prophet_df = data[[DATE_COLUMN, target_column]].rename(columns={DATE_COLUMN: 'ds', target_column: 'y'})

    # Check for sufficient data points for training
    if len(prophet_df) < MIN_DATA_POINTS:
        print(f"Warning: Not enough historical data points ({len(prophet_df)}) for '{target_column}' in the selected group to train. Need at least {MIN_DATA_POINTS}. Skipping forecast.")
        return None, None, None # Return None for model, forecast, and historical predictions

    try:
        print(f"\nTraining Prophet model for '{target_column}'...")
        # Instantiate and fit Prophet model on historical data
        model = Prophet(
            yearly_seasonality=True,
            weekly_seasonality=False, # Adjust based on expected patterns
            daily_seasonality=False   # Adjust based on expected patterns
        )
        model.fit(prophet_df) # Fit the model using historical data
        print("Model training complete.")

        # --- Create future dataframe STARTING FROM TODAY ---
        current_date = pd.Timestamp.now().normalize()
        future_dates = pd.date_range(start=current_date, periods=forecast_periods, freq='D')
        future_df = pd.DataFrame({'ds': future_dates})
        # ----------------------------------------------------

        # Generate forecast using the dates starting from today
        print(f"Generating {forecast_periods}-day forecast starting from {current_date.strftime('%Y-%m-%d')}...")
        forecast = model.predict(future_df)
        print("Forecast generation complete.")

        # Generate predictions on historical data for evaluation
        historical_preds = model.predict(prophet_df) # Predict on the same data used for training

        return model, forecast, historical_preds # Return model, future forecast, historical predictions

    except Exception as e:
        print(f"Error during Prophet modeling or forecasting for {target_column}: {e}")
        return None, None, None

# --- Plotting Function (No changes needed here) ---
def plot_single_forecast(historical_data, forecast_data, target_column, title):
    """Creates a Plotly figure for one target's historical data and forecast."""
    fig = go.Figure()
    target_label = target_column.replace("avg_", "").replace("_price", "").capitalize() # Clean label for display

    # Add historical data trace
    hist_data_col = historical_data[[DATE_COLUMN, target_column]].dropna()
    fig.add_trace(go.Scatter(
        x=hist_data_col[DATE_COLUMN],
        y=hist_data_col[target_column],
        mode='lines',
        name=f'Historical {target_label}',
        line=dict(color='blue')
    ))

    # Add forecast trace (starts from today)
    fig.add_trace(go.Scatter(
        x=forecast_data['ds'],
        y=forecast_data['yhat'],
        mode='lines',
        name=f'Forecast {target_label}',
        line=dict(color='red', dash='dash')
    ))

    # Add uncertainty interval for the forecast
    fig.add_trace(go.Scatter(
        x=forecast_data['ds'],
        y=forecast_data['yhat_upper'],
        mode='lines', name='Forecast Upper Bound',
        line=dict(width=0),
        showlegend=False
    ))
    fig.add_trace(go.Scatter(
        x=forecast_data['ds'],
        y=forecast_data['yhat_lower'],
        mode='lines', name='Forecast Lower Bound',
        line=dict(width=0),
        fillcolor='rgba(255, 0, 0, 0.2)', # Light red fill for uncertainty
        fill='tonexty',
        showlegend=False
    ))

    fig.update_layout(
        title=title,
        xaxis_title='Date',
        yaxis_title=f'Price ({target_label})',
        hovermode="x unified",
        legend_title_text='Legend'
    )
    return fig

# --- Evaluation Metrics Function (No changes needed here) ---
def calculate_metrics(y_true, y_pred):
    """Calculates and returns R2, MAE, and MSE."""
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    return r2, mae, mse

# --- Main Execution Block ---
print("--- Crop Price Time Series Forecasting ---")
print(f"Forecasting from current date: {pd.Timestamp.now().normalize().strftime('%Y-%m-%d')}")

# Load data
df_full = load_data(DATA_PATH)

if df_full is not None:
    # --- Get Encoded Values for Filtering ---
    try:
        # Convert selected strings to lowercase to match typical map keys
        selected_state_key = SELECTED_STATE_STR.strip().lower()
        selected_district_key = SELECTED_DISTRICT_STR.strip().lower()
        selected_commodity_key = SELECTED_COMMODITY_STR.strip().lower()

        encoded_state = state_name_encoding_map.get(selected_state_key)
        encoded_district = district_name_encoding_map.get(selected_district_key)
        encoded_commodity = commodity_name_encoding_map.get(selected_commodity_key)

        # Check if lookup was successful
        if encoded_state is None:
            raise ValueError(f"State '{SELECTED_STATE_STR}' not found in state_name_encoding_map.")
        if encoded_district is None:
            raise ValueError(f"District '{SELECTED_DISTRICT_STR}' not found in district_name_encoding_map.")
        if encoded_commodity is None:
            raise ValueError(f"Commodity '{SELECTED_COMMODITY_STR}' not found in commodity_name_encoding_map.")

        print(f"\nSelected criteria (string): State='{SELECTED_STATE_STR}', District='{SELECTED_DISTRICT_STR}', Commodity='{SELECTED_COMMODITY_STR}'")
        print(f"Corresponding encoded values: State={encoded_state}, District={encoded_district}, Commodity={encoded_commodity}")

    except Exception as e:
        print(f"Error during mapping lookup: {e}")
        print("Please ensure the encoding maps are correct and contain the selected items.")
        df_full = None # Prevent further processing

if df_full is not None:
    # --- Filtering Data Based on Encoded Values ---
    print(f"\nFiltering data using encoded values...")
    filtered_df = df_full[
        (df_full['state_name'] == encoded_state) &
        (df_full['district_name'] == encoded_district) &
        (df_full['commodity_name'] == encoded_commodity)
    ].copy() # Use copy to avoid SettingWithCopyWarning

    # Ensure data is sorted by date (important for plotting historical correctly)
    filtered_df.sort_values(by=DATE_COLUMN, inplace=True)

    if filtered_df.empty:
        print("\nWarning: No historical data found for the selected combination using encoded values.")
        print("Please check the CSV file, your selections, and the encoding maps.")
    else:
        last_hist_date = filtered_df[DATE_COLUMN].max().strftime('%Y-%m-%d')
        print(f"\nFound {len(filtered_df)} historical data points (Latest: {last_hist_date}).")
        print(f"Proceeding with forecast for {FORECAST_DAYS} days.")

        all_forecasts = {} # Dictionary to store forecasts if needed later

        # Loop through each target price type
        for target in TARGET_COLUMNS:
             # First check if target column exists in the filtered data at all
             if target not in filtered_df.columns:
                 print(f"Target column '{target}' not present in the filtered data. Skipping.")
                 continue

             print("-" * 50)
             print(f"Processing Target: {target}")


             # Prepare data for this specific target (dropping NaNs for this target)
             # Ensure we only select existing columns DATE_COLUMN and target
             cols_to_select = [DATE_COLUMN, target]
             target_df = filtered_df[cols_to_select].dropna().copy() # Drop rows missing the target value

             if target_df.empty:
                 print(f"Warning: No valid data points for '{target}' after dropping NaNs for this specific target. Skipping.")
                 continue

             # Train model, get forecast (future dates), and get historical predictions
             model, forecast, historical_preds = train_and_forecast(target_df, target, FORECAST_DAYS)

             if forecast is not None and historical_preds is not None:
                 all_forecasts[target] = forecast # Store the forecast

                 # --- Evaluate Model Fit on Historical Data ---
                 print(f"\n--- Evaluating Model Fit for {target} (on historical data) ---")
                 actuals = target_df['y'].values # Get actual values from the prophet-formatted df
                 preds = historical_preds['yhat'].values # Get predictions

                 if len(actuals) == len(preds):
                     r2, mae, mse = calculate_metrics(actuals, preds)
                     print(f"R-squared (R2): {r2:.4f}")
                     print(f"Mean Absolute Error (MAE): {mae:.2f}")
                     print(f"Mean Squared Error (MSE): {mse:.2f}")
                 else:
                     print("Warning: Mismatch between actuals and predictions length. Cannot calculate metrics accurately.")
                     print(f"Actuals length: {len(actuals)}, Predictions length: {len(preds)}")


                 # --- Plot Historical Data and Forecast ---
                 print(f"\n--- Plotting Historical Data & Forecast for {target} ---")
                 plot_title = f'{target.replace("avg_", "").replace("_price", "").capitalize()} Price: Historical & {FORECAST_DAYS}-Day Forecast\n({SELECTED_COMMODITY_STR} in {SELECTED_DISTRICT_STR}, {SELECTED_STATE_STR})'
                 # Pass target_df which contains only date and the current target value
                 fig = plot_single_forecast(target_df, forecast, target, plot_title)
                 fig.show() # Display the plot in the notebook output

                 # --- Display Forecast Data Table ---
                 print(f"\n--- Forecast Data Table for {target} ({FORECAST_DAYS} days) ---")
                 f_display = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].copy()
                 f_display.columns = ['Date', 'Forecast', 'Lower Bound', 'Upper Bound']
                 f_display['Date'] = f_display['Date'].dt.strftime('%Y-%m-%d') # Format date
                 display(f_display.set_index('Date').style.format("{:.2f}"))

             else:
                 print(f"Skipping results display for {target} due to insufficient data or error during modeling.")

        print("-" * 50)
        print("\nForecasting process finished.")

else:
    print("\nFailed to load data or error during mapping lookup. Cannot run the forecasting process.")

In [5]:
import pandas as pd
from prophet import Prophet
import plotly.graph_objects as go
import datetime
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from IPython.display import display # Optional: for better dataframe rendering in notebooks
import numpy as np # Needed for IQR outlier check

# --- Configuration ---
# Use the RAW data path now
DATA_PATH = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\wheat_price(2021-24).csv"
TARGET_COLUMNS = ['avg_min_price', 'avg_max_price', 'avg_modal_price']
DATE_COLUMN = 'date' # Ensure this matches the column name in your raw CSV
MIN_DATA_POINTS = 30 # Minimum data points required to train a model

# --- User Selections (Strings) ---
SELECTED_STATE_STR = "Maharashtra"  # Example: Choose a state from your data
SELECTED_DISTRICT_STR = "Akola"     # Example: Choose a district
SELECTED_COMMODITY_STR = "Wheat"    # Example: Choose a commodity
FORECAST_DAYS = 180            # Example: Forecast period

# --- Outlier Removal Function ---
def remove_outliers_iqr(df, columns_to_check):
    """Removes outliers from specified numerical columns using the IQR method."""
    df_filtered = df.copy()
    initial_rows = len(df_filtered)
    print(f"Applying IQR Outlier Removal on columns: {columns_to_check}")

    # Ensure columns exist and are numeric before calculating IQR
    valid_columns = [col for col in columns_to_check if col in df_filtered.columns and pd.api.types.is_numeric_dtype(df_filtered[col])]
    if not valid_columns:
        print("Warning: No valid numeric columns found for IQR outlier removal.")
        return df_filtered

    print(f"Valid numeric columns for IQR: {valid_columns}")
    subset_for_iqr = df_filtered[valid_columns]

    Q1 = subset_for_iqr.quantile(0.25)
    Q3 = subset_for_iqr.quantile(0.75)
    IQR = Q3 - Q1

    # Create a boolean mask for rows that are NOT outliers in ANY of the checked columns
    # Note: '~' negates the condition, keeping rows that are *within* the bounds.
    # .any(axis=1) flags a row if it's an outlier in *at least one* column.
    mask = ~((subset_for_iqr < (Q1 - 1.5 * IQR)) | (subset_for_iqr > (Q3 + 1.5 * IQR))).any(axis=1)

    df_filtered = df_filtered[mask]
    rows_removed = initial_rows - len(df_filtered)
    print(f"Removed {rows_removed} rows identified as outliers based on IQR.")
    return df_filtered


# --- Data Loading and Preprocessing Function (Incorporating User Steps) ---
def load_and_preprocess_data(path, date_col, target_cols):
    """Loads raw data and applies user-defined preprocessing steps."""
    try:
        print(f"Loading raw data from {path}...")
        df = pd.read_csv(path)
        print(f"Successfully loaded {len(df)} rows.")

        # --- User Preprocessing Steps START ---

        # 1. Handle Date Column (CRITICAL: Keep original date for Prophet)
        if date_col not in df.columns:
            print(f"Error: Date column '{date_col}' not found in CSV.")
            return None
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
        df.dropna(subset=[date_col], inplace=True) # Drop rows where date conversion failed
        print(f"{len(df)} rows after date parsing and dropping invalid dates.")

        # Optional: Extract day/month/year if needed for other analysis (but keep original date)
        # df['day'] = df[date_col].dt.day
        # df['month'] = df[date_col].dt.month
        # df['year'] = df[date_col].dt.year

        # 2. Initial dropna() as requested by user
        initial_rows = len(df)
        df.dropna(inplace=True)
        print(f"{len(df)} rows after initial broad dropna(). {initial_rows - len(df)} rows removed.")
        # Note: User's fillna('interpolate') was syntactically incorrect and likely redundant after dropna. Skipping.

        # 3. Ensure Price columns are numeric (Do this AFTER dropna potentially removes non-numeric entries)
        for col in target_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            else:
                print(f"Warning: Target column '{col}' not found.")
        # Drop rows where target columns became NaN after conversion (should be few if dropna worked)
        df.dropna(subset=target_cols, inplace=True, how='any')
        print(f"{len(df)} rows after ensuring target columns are numeric.")


        # 4. Frequency Encoding - SKIPPED HERE. Filtering needs original strings.
        #    If encoding is needed for a different model later, apply it AFTER filtering.

        # 5. Drop specified unused columns
        cols_to_drop = ['calculationType', 'district_id', 'change']
        # Check if columns exist before dropping
        existing_cols_to_drop = [col for col in cols_to_drop if col in df.columns]
        if existing_cols_to_drop:
            df.drop(columns=existing_cols_to_drop, axis=1, inplace=True)
            print(f"Dropped columns: {existing_cols_to_drop}")
        else:
             print(f"Columns to drop ({cols_to_drop}) not found.")


        # 6. Apply IQR Outlier Removal (Targeted at price columns)
        df = remove_outliers_iqr(df, target_cols)

        # --- User Preprocessing Steps END ---

        # Final check for required columns
        required_cols = [date_col] + target_cols + ['state_name', 'district_name', 'commodity_name']
        missing_req_cols = [col for col in required_cols if col not in df.columns]
        if missing_req_cols:
            print(f"Error: Required columns missing after preprocessing: {missing_req_cols}")
            return None

        df.sort_values(date_col, inplace=True)
        print(f"Preprocessing complete. {len(df)} rows remaining.")
        return df

    except FileNotFoundError:
        print(f"Error: Data file not found at {path}")
        return None
    except Exception as e:
        print(f"Error loading or preprocessing data: {e}")
        # print traceback for detailed debugging if needed
        # import traceback
        # traceback.print_exc()
        return None

# --- Modeling Function (No changes needed) ---
def train_and_forecast(data, target_column, forecast_periods):
    """Trains a Prophet model and returns the model, forecast (starting from today),
       and predictions on the historical data for evaluation."""
    # Prepare data for Prophet (requires 'ds' and 'y' columns)
    prophet_df = data[[DATE_COLUMN, target_column]].rename(columns={DATE_COLUMN: 'ds', target_column: 'y'})

    # Check for sufficient data points for training
    if len(prophet_df) < MIN_DATA_POINTS:
        print(f"Warning: Not enough historical data points ({len(prophet_df)}) for '{target_column}' in the selected group to train. Need at least {MIN_DATA_POINTS}. Skipping forecast.")
        return None, None, None # Return None for model, forecast, and historical predictions

    try:
        print(f"\nTraining Prophet model for '{target_column}'...")
        model = Prophet(
            yearly_seasonality=True,
            weekly_seasonality=False,
            daily_seasonality=False
        )
        model.fit(prophet_df)
        print("Model training complete.")

        current_date = pd.Timestamp.now().normalize()
        future_dates = pd.date_range(start=current_date, periods=forecast_periods, freq='D')
        future_df = pd.DataFrame({'ds': future_dates})

        print(f"Generating {forecast_periods}-day forecast starting from {current_date.strftime('%Y-%m-%d')}...")
        forecast = model.predict(future_df)
        print("Forecast generation complete.")

        historical_preds = model.predict(prophet_df)

        return model, forecast, historical_preds

    except Exception as e:
        print(f"Error during Prophet modeling or forecasting for {target_column}: {e}")
        return None, None, None

# --- Plotting Function (No changes needed) ---
def plot_single_forecast(historical_data, forecast_data, target_column, title):
    """Creates a Plotly figure for one target's historical data and forecast."""
    fig = go.Figure()
    target_label = target_column.replace("avg_", "").replace("_price", "").capitalize()

    # Add historical data trace (use DATE_COLUMN here)
    hist_data_col = historical_data[[DATE_COLUMN, target_column]].dropna()
    fig.add_trace(go.Scatter(
        x=hist_data_col[DATE_COLUMN],
        y=hist_data_col[target_column],
        mode='lines',
        name=f'Historical {target_label}',
        line=dict(color='blue')
    ))

    # Add forecast trace (uses 'ds' from forecast data)
    fig.add_trace(go.Scatter(
        x=forecast_data['ds'],
        y=forecast_data['yhat'],
        mode='lines',
        name=f'Forecast {target_label}',
        line=dict(color='red', dash='dash')
    ))

    # Add uncertainty interval for the forecast
    fig.add_trace(go.Scatter(
        x=forecast_data['ds'],
        y=forecast_data['yhat_upper'],
        mode='lines', name='Forecast Upper Bound',
        line=dict(width=0),
        showlegend=False
    ))
    fig.add_trace(go.Scatter(
        x=forecast_data['ds'],
        y=forecast_data['yhat_lower'],
        mode='lines', name='Forecast Lower Bound',
        line=dict(width=0),
        fillcolor='rgba(255, 0, 0, 0.2)',
        fill='tonexty',
        showlegend=False
    ))

    fig.update_layout(
        title=title,
        xaxis_title='Date',
        yaxis_title=f'Price ({target_label})',
        hovermode="x unified",
        legend_title_text='Legend'
    )
    return fig

# --- Evaluation Metrics Function (No changes needed) ---
def calculate_metrics(y_true, y_pred):
    """Calculates and returns R2, MAE, and MSE."""
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    return r2, mae, mse

# --- Main Execution Block ---
print("--- Crop Price Time Series Forecasting ---")
print(f"Forecasting from current date: {pd.Timestamp.now().normalize().strftime('%Y-%m-%d')}")

# Load data using the new preprocessing function
df_full = load_and_preprocess_data(DATA_PATH, DATE_COLUMN, TARGET_COLUMNS)

if df_full is not None:
    # --- Filtering Data Based on User Selections (using original string columns) ---
    print(f"\nFiltering data for State='{SELECTED_STATE_STR}', District='{SELECTED_DISTRICT_STR}', Commodity='{SELECTED_COMMODITY_STR}'...")

    # Check if filter columns exist before attempting to filter
    filter_cols = ['state_name', 'district_name', 'commodity_name']
    missing_filter_cols = [col for col in filter_cols if col not in df_full.columns]
    if missing_filter_cols:
        print(f"Error: Cannot filter because columns are missing: {missing_filter_cols}")
        df_full = None # Stop processing
    else:
        # Apply the filter using string methods (now valid)
        filtered_df = df_full[
            (df_full['state_name'].str.strip().str.lower() == SELECTED_STATE_STR.strip().lower()) &
            (df_full['district_name'].str.strip().str.lower() == SELECTED_DISTRICT_STR.strip().lower()) &
            (df_full['commodity_name'].str.strip().str.lower() == SELECTED_COMMODITY_STR.strip().lower())
        ].copy() # Use copy to avoid SettingWithCopyWarning

        # Ensure data is sorted by date after filtering
        filtered_df.sort_values(by=DATE_COLUMN, inplace=True)

# Proceed only if filtering was possible and resulted in data
if df_full is not None and 'filtered_df' in locals():

    if filtered_df.empty:
        print("\nWarning: No historical data found for the selected combination after preprocessing and filtering.")
        print("Please check the raw CSV file and your selections.")
    else:
        last_hist_date = filtered_df[DATE_COLUMN].max().strftime('%Y-%m-%d')
        print(f"\nFound {len(filtered_df)} historical data points for selection (Latest: {last_hist_date}).")
        print(f"Proceeding with forecast for {FORECAST_DAYS} days.")

        all_forecasts = {} # Dictionary to store forecasts if needed later

        # Loop through each target price type
        for target in TARGET_COLUMNS:
            if target not in filtered_df.columns:
                print(f"\nTarget column '{target}' not present in the filtered data. Skipping.")
                continue

            print("-" * 50)
            print(f"Processing Target: {target}")

            # Prepare data specific to this target (dropping NaNs ONLY for this target in the filtered data)
            target_df = filtered_df[[DATE_COLUMN, target]].dropna().copy()

            if target_df.empty:
                print(f"Warning: No valid data points remain for '{target}' after dropping NaNs within the filtered selection. Skipping.")
                continue

            # Train model, get forecast (future dates), and get historical predictions
            # Pass target_df which contains only date and the current target price
            model, forecast, historical_preds = train_and_forecast(target_df, target, FORECAST_DAYS)

            if forecast is not None and historical_preds is not None:
                all_forecasts[target] = forecast

                # --- Evaluate Model Fit on Historical Data ---
                print(f"\n--- Evaluating Model Fit for {target} (on historical data) ---")
                # Actual values are in the 'y' column of the data passed to Prophet
                prophet_input_df = target_df.rename(columns={DATE_COLUMN: 'ds', target: 'y'})
                actuals = prophet_input_df['y'].values
                preds = historical_preds['yhat'].values

                if len(actuals) == len(preds):
                    r2, mae, mse = calculate_metrics(actuals, preds)
                    print(f"R-squared (R2): {r2:.4f}")
                    print(f"Mean Absolute Error (MAE): {mae:.2f}")
                    print(f"Mean Squared Error (MSE): {mse:.2f}")
                else:
                    print("Warning: Mismatch between actuals and predictions length. Cannot calculate metrics accurately.")
                    print(f"Actuals length: {len(actuals)}, Predictions length: {len(preds)}")


                # --- Plot Historical Data and Forecast ---
                print(f"\n--- Plotting Historical Data & Forecast for {target} ---")
                plot_title = f'{target.replace("avg_", "").replace("_price", "").capitalize()} Price: Historical & {FORECAST_DAYS}-Day Forecast\n({SELECTED_COMMODITY_STR} in {SELECTED_DISTRICT_STR}, {SELECTED_STATE_STR})'
                # Pass target_df which contains the original date and target columns
                fig = plot_single_forecast(target_df, forecast, target, plot_title)
                fig.show()

                # --- Display Forecast Data Table ---
                print(f"\n--- Forecast Data Table for {target} ({FORECAST_DAYS} days) ---")
                f_display = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].copy()
                f_display.columns = ['Date', 'Forecast', 'Lower Bound', 'Upper Bound']
                f_display['Date'] = f_display['Date'].dt.strftime('%Y-%m-%d')
                display(f_display.set_index('Date').style.format("{:.2f}"))

            else:
                print(f"Skipping results display for {target} due to insufficient data or error during modeling.")

        print("-" * 50)
        print("\nForecasting process finished.")

elif df_full is None:
    print("\nFailed to load or preprocess data. Cannot run the forecasting process.")
# No 'else' needed here, covered by the check for filtered_df being empty or not existing

--- Crop Price Time Series Forecasting ---
Forecasting from current date: 2025-04-06
Loading raw data from E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\wheat_price(2021-24).csv...
Successfully loaded 29201 rows.
29201 rows after date parsing and dropping invalid dates.
29170 rows after initial broad dropna(). 31 rows removed.
29170 rows after ensuring target columns are numeric.
Dropped columns: ['calculationType', 'district_id', 'change']
Applying IQR Outlier Removal on columns: ['avg_min_price', 'avg_max_price', 'avg_modal_price']
Valid numeric columns for IQR: ['avg_min_price', 'avg_max_price', 'avg_modal_price']
Removed 1179 rows identified as outliers based on IQR.
Preprocessing complete. 27991 rows remaining.

Filtering data for State='Maharashtra', District='Akola', Commodity='Wheat'...

Found 1231 historical data points for selection (Latest: 2024-12-31).
Proceeding with forecast for 180 days.
--------------------------------------------------
Processing Tar

19:43:53 - cmdstanpy - INFO - Chain [1] start processing
19:43:53 - cmdstanpy - INFO - Chain [1] done processing


Model training complete.
Generating 180-day forecast starting from 2025-04-06...
Forecast generation complete.

--- Evaluating Model Fit for avg_min_price (on historical data) ---
R-squared (R2): 0.7481
Mean Absolute Error (MAE): 136.20
Mean Squared Error (MSE): 31542.04

--- Plotting Historical Data & Forecast for avg_min_price ---



--- Forecast Data Table for avg_min_price (180 days) ---


Unnamed: 0_level_0,Forecast,Lower Bound,Upper Bound
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-04-06,2439.73,2214.24,2674.72
2025-04-07,2444.81,2222.96,2675.31
2025-04-08,2449.78,2227.95,2677.07
2025-04-09,2454.54,2236.56,2679.52
2025-04-10,2459.04,2238.85,2687.21
2025-04-11,2463.18,2233.14,2685.57
2025-04-12,2466.89,2242.12,2692.53
2025-04-13,2470.11,2225.21,2690.89
2025-04-14,2472.76,2245.62,2684.37
2025-04-15,2474.8,2251.45,2694.96


19:43:54 - cmdstanpy - INFO - Chain [1] start processing


--------------------------------------------------
Processing Target: avg_max_price

Training Prophet model for 'avg_max_price'...


19:43:54 - cmdstanpy - INFO - Chain [1] done processing


Model training complete.
Generating 180-day forecast starting from 2025-04-06...
Forecast generation complete.

--- Evaluating Model Fit for avg_max_price (on historical data) ---
R-squared (R2): 0.7772
Mean Absolute Error (MAE): 155.90
Mean Squared Error (MSE): 39928.97

--- Plotting Historical Data & Forecast for avg_max_price ---



--- Forecast Data Table for avg_max_price (180 days) ---


Unnamed: 0_level_0,Forecast,Lower Bound,Upper Bound
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-04-06,3234.39,2985.85,3486.84
2025-04-07,3240.14,2975.21,3493.34
2025-04-08,3245.05,2963.97,3479.27
2025-04-09,3249.08,2996.13,3495.01
2025-04-10,3252.2,2991.88,3520.13
2025-04-11,3254.39,2986.51,3502.72
2025-04-12,3255.64,3007.72,3511.4
2025-04-13,3255.95,3011.53,3507.36
2025-04-14,3255.34,2997.47,3492.68
2025-04-15,3253.84,2996.91,3516.04


19:43:54 - cmdstanpy - INFO - Chain [1] start processing


--------------------------------------------------
Processing Target: avg_modal_price

Training Prophet model for 'avg_modal_price'...


19:43:54 - cmdstanpy - INFO - Chain [1] done processing


Model training complete.
Generating 180-day forecast starting from 2025-04-06...
Forecast generation complete.

--- Evaluating Model Fit for avg_modal_price (on historical data) ---
R-squared (R2): 0.7801
Mean Absolute Error (MAE): 141.42
Mean Squared Error (MSE): 33848.73

--- Plotting Historical Data & Forecast for avg_modal_price ---



--- Forecast Data Table for avg_modal_price (180 days) ---


Unnamed: 0_level_0,Forecast,Lower Bound,Upper Bound
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-04-06,2926.23,2690.74,3162.98
2025-04-07,2932.28,2699.01,3158.35
2025-04-08,2937.64,2708.59,3167.44
2025-04-09,2942.26,2697.38,3184.89
2025-04-10,2946.1,2726.04,3186.52
2025-04-11,2949.13,2722.79,3188.89
2025-04-12,2951.32,2710.14,3192.56
2025-04-13,2952.67,2697.24,3183.37
2025-04-14,2953.2,2701.58,3176.76
2025-04-15,2952.91,2728.9,3189.1


--------------------------------------------------

Forecasting process finished.


validation on 2025 dataset

In [1]:
import pandas as pd
from prophet import Prophet
import plotly.graph_objects as go
import datetime
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from IPython.display import display # Optional: for better dataframe rendering in notebooks
import numpy as np # Needed for IQR outlier check

# --- Configuration ---
# Training Data (Historical)
DATA_PATH_TRAIN = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\wheat_price(2021-24).csv"
# Validation Data (Future/Test)
DATA_PATH_VALIDATION = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\wheat_price(2025).csv" # <--- !!! UPDATE THIS PATH !!!

TARGET_COLUMNS = ['avg_min_price', 'avg_max_price', 'avg_modal_price']
DATE_COLUMN = 'date' # Ensure this matches the column name in your raw CSVs
MIN_DATA_POINTS_TRAIN = 30 # Minimum data points required to train a model

# --- User Selections (Strings) ---
SELECTED_STATE_STR = "Maharashtra"  # Example: Choose a state from your data
SELECTED_DISTRICT_STR = "Akola"     # Example: Choose a district
SELECTED_COMMODITY_STR = "Wheat"    # Example: Choose a commodity
# FORECAST_DAYS is no longer directly used for plotting, but prediction happens on validation dates

# --- Outlier Removal Function (No changes needed) ---
def remove_outliers_iqr(df, columns_to_check):
    """Removes outliers from specified numerical columns using the IQR method."""
    df_filtered = df.copy()
    initial_rows = len(df_filtered)
    print(f"Applying IQR Outlier Removal on columns: {columns_to_check}")
    valid_columns = [col for col in columns_to_check if col in df_filtered.columns and pd.api.types.is_numeric_dtype(df_filtered[col])]
    if not valid_columns:
        print("Warning: No valid numeric columns found for IQR outlier removal.")
        return df_filtered
    print(f"Valid numeric columns for IQR: {valid_columns}")
    subset_for_iqr = df_filtered[valid_columns]
    Q1 = subset_for_iqr.quantile(0.25)
    Q3 = subset_for_iqr.quantile(0.75)
    IQR = Q3 - Q1
    mask = ~((subset_for_iqr < (Q1 - 1.5 * IQR)) | (subset_for_iqr > (Q3 + 1.5 * IQR))).any(axis=1)
    df_filtered = df_filtered[mask]
    rows_removed = initial_rows - len(df_filtered)
    print(f"Removed {rows_removed} rows identified as outliers based on IQR.")
    return df_filtered

# --- Data Loading and Preprocessing Function (Revised slightly for clarity) ---
def load_and_preprocess_data(path, date_col, target_cols, dataset_name="Training"):
    """Loads raw data and applies user-defined preprocessing steps."""
    print("-" * 30)
    print(f"Processing {dataset_name} Dataset")
    print("-" * 30)
    try:
        print(f"Loading {dataset_name} data from {path}...")
        df = pd.read_csv(path)
        print(f"Successfully loaded {len(df)} rows.")

        # 1. Handle Date Column
        if date_col not in df.columns:
            print(f"Error: Date column '{date_col}' not found in {dataset_name} CSV.")
            return None
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
        df.dropna(subset=[date_col], inplace=True)
        print(f"{len(df)} rows after date parsing and dropping invalid dates.")

        # 2. Initial dropna()
        initial_rows = len(df)
        df.dropna(inplace=True)
        print(f"{len(df)} rows after initial broad dropna(). {initial_rows - len(df)} rows removed.")

        # 3. Ensure Price columns are numeric
        for col in target_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            else:
                print(f"Warning: Target column '{col}' not found in {dataset_name} data.")
        df.dropna(subset=target_cols, inplace=True, how='any')
        print(f"{len(df)} rows after ensuring target columns are numeric.")

        # 4. Drop specified unused columns
        cols_to_drop = ['calculationType', 'district_id', 'change']
        existing_cols_to_drop = [col for col in cols_to_drop if col in df.columns]
        if existing_cols_to_drop:
            df.drop(columns=existing_cols_to_drop, axis=1, inplace=True)
            print(f"Dropped columns: {existing_cols_to_drop}")

        # 5. Apply IQR Outlier Removal (Targeted at price columns)
        # Optional: you might choose *not* to remove outliers from validation set
        # to test robustness, but keeping it consistent for now.
        df = remove_outliers_iqr(df, target_cols)

        # Final check for required columns
        required_cols = [date_col] + target_cols + ['state_name', 'district_name', 'commodity_name']
        missing_req_cols = [col for col in required_cols if col not in df.columns]
        if missing_req_cols:
            print(f"Error: Required columns missing in {dataset_name} data after preprocessing: {missing_req_cols}")
            return None

        df.sort_values(date_col, inplace=True)
        print(f"{dataset_name} data preprocessing complete. {len(df)} rows remaining.")
        return df

    except FileNotFoundError:
        print(f"Error: {dataset_name} data file not found at {path}")
        return None
    except Exception as e:
        print(f"Error loading or preprocessing {dataset_name} data: {e}")
        return None

# --- NEW Training Function ---
def train_prophet_model(training_data, target_column):
    """Trains a Prophet model on historical data and returns the fitted model."""
    print(f"\nTraining Prophet model for '{target_column}' using historical data...")
    # Prepare data for Prophet (requires 'ds' and 'y' columns)
    prophet_df = training_data[[DATE_COLUMN, target_column]].rename(columns={DATE_COLUMN: 'ds', target_column: 'y'})

    # Check for sufficient data points for training
    if len(prophet_df) < MIN_DATA_POINTS_TRAIN:
        print(f"Warning: Not enough historical data points ({len(prophet_df)}) for '{target_column}' to train. Need at least {MIN_DATA_POINTS_TRAIN}. Skipping.")
        return None

    try:
        model = Prophet(
            yearly_seasonality=True,
            weekly_seasonality=False, # Adjust as needed
            daily_seasonality=False  # Adjust as needed
        )
        model.fit(prophet_df)
        print("Model training complete.")
        return model
    except Exception as e:
        print(f"Error during Prophet model training for {target_column}: {e}")
        return None

# --- Evaluation Metrics Function (No changes needed) ---
def calculate_metrics(y_true, y_pred):
    """Calculates and returns R2, MAE, and MSE."""
    # Ensure inputs are numpy arrays and not empty
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    if len(y_true) == 0 or len(y_pred) == 0:
        print("Warning: Empty arrays passed to calculate_metrics.")
        return np.nan, np.nan, np.nan
    if len(y_true) != len(y_pred):
        print(f"Warning: Length mismatch in calculate_metrics. True: {len(y_true)}, Pred: {len(y_pred)}")
        # Attempt calculation if lengths are close, otherwise return NaN
        min_len = min(len(y_true), len(y_pred))
        if min_len == 0 : return np.nan, np.nan, np.nan
        y_true = y_true[:min_len]
        y_pred = y_pred[:min_len]

    try:
        r2 = r2_score(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)
        return r2, mae, mse
    except Exception as e:
        print(f"Error calculating metrics: {e}")
        return np.nan, np.nan, np.nan


# --- NEW Plotting Function for Validation ---
def plot_validation_results(validation_actuals, validation_predictions, target_column, title):
    """Creates a Plotly figure comparing actual validation data vs predictions."""
    fig = go.Figure()
    target_label = target_column.replace("avg_", "").replace("_price", "").capitalize()

    # Add Actual Validation Data trace
    actual_data_plot = validation_actuals[[DATE_COLUMN, target_column]].dropna()
    fig.add_trace(go.Scatter(
        x=actual_data_plot[DATE_COLUMN],
        y=actual_data_plot[target_column],
        mode='lines+markers',
        name=f'Actual {target_label} (2025)',
        line=dict(color='blue'),
        marker=dict(size=4)
    ))

    # Add Predicted Validation Data trace (uses 'ds' from prediction results)
    fig.add_trace(go.Scatter(
        x=validation_predictions['ds'],
        y=validation_predictions['yhat'],
        mode='lines',
        name=f'Predicted {target_label} (2025)',
        line=dict(color='red')
    ))

    # Add uncertainty interval for the predictions
    fig.add_trace(go.Scatter(
        x=validation_predictions['ds'],
        y=validation_predictions['yhat_upper'],
        mode='lines', name='Prediction Upper Bound',
        line=dict(width=0),
        showlegend=False
    ))
    fig.add_trace(go.Scatter(
        x=validation_predictions['ds'],
        y=validation_predictions['yhat_lower'],
        mode='lines', name='Prediction Lower Bound',
        line=dict(width=0),
        fillcolor='rgba(255, 0, 0, 0.2)', # Light red fill for uncertainty
        fill='tonexty',
        showlegend=False
    ))

    fig.update_layout(
        title=title,
        xaxis_title='Date (2025)',
        yaxis_title=f'Price ({target_label})',
        hovermode="x unified",
        legend_title_text='Legend'
    )
    return fig


# --- Main Execution Block ---
print("--- Crop Price Time Series Forecasting & Validation ---")

# Load Training data (2021-24)
df_train_full = load_and_preprocess_data(DATA_PATH_TRAIN, DATE_COLUMN, TARGET_COLUMNS, "Training")

# Load Validation data (2025)
df_val_full = load_and_preprocess_data(DATA_PATH_VALIDATION, DATE_COLUMN, TARGET_COLUMNS, "Validation")

# Proceed only if both datasets loaded successfully
if df_train_full is not None and df_val_full is not None:

    # --- Filtering Data Based on User Selections ---
    print(f"\nFiltering datasets for State='{SELECTED_STATE_STR}', District='{SELECTED_DISTRICT_STR}', Commodity='{SELECTED_COMMODITY_STR}'...")

    # Filter Training Data
    filter_cols = ['state_name', 'district_name', 'commodity_name']
    if not all(col in df_train_full.columns for col in filter_cols):
         print("Error: Filter columns missing in Training data.")
         filtered_df_train = pd.DataFrame() # Empty df
    else:
        filtered_df_train = df_train_full[
            (df_train_full['state_name'].str.strip().str.lower() == SELECTED_STATE_STR.strip().lower()) &
            (df_train_full['district_name'].str.strip().str.lower() == SELECTED_DISTRICT_STR.strip().lower()) &
            (df_train_full['commodity_name'].str.strip().str.lower() == SELECTED_COMMODITY_STR.strip().lower())
        ].copy()
        filtered_df_train.sort_values(by=DATE_COLUMN, inplace=True)

    # Filter Validation Data
    if not all(col in df_val_full.columns for col in filter_cols):
         print("Error: Filter columns missing in Validation data.")
         filtered_df_val = pd.DataFrame() # Empty df
    else:
        filtered_df_val = df_val_full[
            (df_val_full['state_name'].str.strip().str.lower() == SELECTED_STATE_STR.strip().lower()) &
            (df_val_full['district_name'].str.strip().str.lower() == SELECTED_DISTRICT_STR.strip().lower()) &
            (df_val_full['commodity_name'].str.strip().str.lower() == SELECTED_COMMODITY_STR.strip().lower())
        ].copy()
        filtered_df_val.sort_values(by=DATE_COLUMN, inplace=True)

    # Check if filtering resulted in data for BOTH sets
    if filtered_df_train.empty:
        print("\nWarning: No training data found for the selected combination after filtering.")
    if filtered_df_val.empty:
        print("\nWarning: No validation data found for the selected combination after filtering.")

    # Proceed only if BOTH filtered datasets have data
    if not filtered_df_train.empty and not filtered_df_val.empty:
        print(f"\nFound {len(filtered_df_train)} training points and {len(filtered_df_val)} validation points for the selection.")

        # Loop through each target price type
        for target in TARGET_COLUMNS:
            print("=" * 60)
            print(f"Processing Target: {target}")
            print("=" * 60)

            # --- Prepare Data for Current Target ---
            # Training data for this target
            target_df_train = filtered_df_train[[DATE_COLUMN, target]].dropna().copy()
            # Validation data for this target
            target_df_val = filtered_df_val[[DATE_COLUMN, target]].dropna().copy()

            if target_df_train.empty:
                print(f"Warning: No valid training data points for '{target}'. Skipping.")
                continue
            if target_df_val.empty:
                print(f"Warning: No valid validation data points for '{target}'. Skipping.")
                continue

            # --- Train Model ---
            model = train_prophet_model(target_df_train, target)

            if model is not None:
                # --- Predict on Validation Period ---
                print(f"\nPredicting on validation period (2025) for '{target}'...")
                # Prepare dataframe with validation dates for prediction
                future_df_val = target_df_val[[DATE_COLUMN]].rename(columns={DATE_COLUMN: 'ds'})

                try:
                    validation_forecast = model.predict(future_df_val)
                    print("Prediction on validation dates complete.")

                    # --- Evaluate on Validation Set ---
                    print(f"\n--- Evaluating Model Performance on 2025 Validation Data for {target} ---")

                    # Merge actuals and predictions based on date for robust alignment
                    # Use 'ds' from forecast and DATE_COLUMN from actuals
                    validation_results = pd.merge(
                        target_df_val[[DATE_COLUMN, target]],
                        validation_forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']],
                        left_on=DATE_COLUMN,
                        right_on='ds',
                        how='inner' # Only compare dates present in both actuals and predictions
                    )

                    if not validation_results.empty:
                        actuals_val = validation_results[target]
                        preds_val = validation_results['yhat']

                        r2_val, mae_val, mse_val = calculate_metrics(actuals_val, preds_val)
                        print(f"Validation R-squared (R2): {r2_val:.4f}")
                        print(f"Validation Mean Absolute Error (MAE): {mae_val:.2f}")
                        print(f"Validation Mean Squared Error (MSE): {mse_val:.2f}")

                        # --- Plot Validation Results ---
                        print(f"\n--- Plotting Validation Results for {target} (Actual vs. Predicted 2025) ---")
                        plot_title_val = f'Validation: {target.replace("avg_", "").replace("_price", "").capitalize()} Price (Actual vs. Predicted 2025)\n({SELECTED_COMMODITY_STR} in {SELECTED_DISTRICT_STR}, {SELECTED_STATE_STR})'
                        # Pass the original validation data (target_df_val) for actuals
                        # Pass the prediction results (validation_forecast - need aligned results)
                        fig_val = plot_validation_results(target_df_val, validation_results, target, plot_title_val)
                        fig_val.show()

                    else:
                        print("Warning: Could not align actual validation data with predictions. Skipping evaluation and plotting.")

                except Exception as e:
                    print(f"Error during prediction or evaluation on validation data: {e}")
            else:
                print(f"Skipping validation for {target} because model training failed.")

        print("-" * 60)
        print("\nForecasting and Validation process finished.")

    else:
         print("\nCannot proceed with forecasting/validation due to lack of data in filtered training or validation sets.")

else:
    print("\nFailed to load or preprocess one or both datasets. Cannot run the process.")

  from .autonotebook import tqdm as notebook_tqdm


--- Crop Price Time Series Forecasting & Validation ---
------------------------------
Processing Training Dataset
------------------------------
Loading Training data from E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\wheat_price(2021-24).csv...
Successfully loaded 29201 rows.
29201 rows after date parsing and dropping invalid dates.
29170 rows after initial broad dropna(). 31 rows removed.
29170 rows after ensuring target columns are numeric.
Dropped columns: ['calculationType', 'district_id', 'change']
Applying IQR Outlier Removal on columns: ['avg_min_price', 'avg_max_price', 'avg_modal_price']
Valid numeric columns for IQR: ['avg_min_price', 'avg_max_price', 'avg_modal_price']
Removed 1179 rows identified as outliers based on IQR.
Training data preprocessing complete. 27991 rows remaining.
------------------------------
Processing Validation Dataset
------------------------------
Loading Validation data from E:\elevatetrsest\crop price predictor\Crop_price_Predi

18:40:29 - cmdstanpy - INFO - Chain [1] start processing
18:40:29 - cmdstanpy - INFO - Chain [1] done processing


Model training complete.

Predicting on validation period (2025) for 'avg_min_price'...
Prediction on validation dates complete.

--- Evaluating Model Performance on 2025 Validation Data for avg_min_price ---
Validation R-squared (R2): -0.4835
Validation Mean Absolute Error (MAE): 184.93
Validation Mean Squared Error (MSE): 51324.49

--- Plotting Validation Results for avg_min_price (Actual vs. Predicted 2025) ---


18:40:29 - cmdstanpy - INFO - Chain [1] start processing


Processing Target: avg_max_price

Training Prophet model for 'avg_max_price' using historical data...


18:40:29 - cmdstanpy - INFO - Chain [1] done processing


Model training complete.

Predicting on validation period (2025) for 'avg_max_price'...
Prediction on validation dates complete.

--- Evaluating Model Performance on 2025 Validation Data for avg_max_price ---
Validation R-squared (R2): -0.1212
Validation Mean Absolute Error (MAE): 204.19
Validation Mean Squared Error (MSE): 81593.58

--- Plotting Validation Results for avg_max_price (Actual vs. Predicted 2025) ---


18:40:29 - cmdstanpy - INFO - Chain [1] start processing


Processing Target: avg_modal_price

Training Prophet model for 'avg_modal_price' using historical data...


18:40:30 - cmdstanpy - INFO - Chain [1] done processing


Model training complete.

Predicting on validation period (2025) for 'avg_modal_price'...
Prediction on validation dates complete.

--- Evaluating Model Performance on 2025 Validation Data for avg_modal_price ---
Validation R-squared (R2): -0.0477
Validation Mean Absolute Error (MAE): 163.06
Validation Mean Squared Error (MSE): 52700.31

--- Plotting Validation Results for avg_modal_price (Actual vs. Predicted 2025) ---


------------------------------------------------------------

Forecasting and Validation process finished.


## for one district only  (nashik)

In [4]:
import pandas as pd
from prophet import Prophet
import plotly.graph_objects as go
import datetime
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from IPython.display import display # Optional: for better dataframe rendering in notebooks
import numpy as np # Needed for IQR outlier check

# --- Configuration ---
# Training Data (Historical: 2002-2023, Nashik)
DATA_PATH_TRAIN = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_2002_2023.csv" # <--- UPDATED PATH
# Validation Data (Test: 2024, Nashik)
DATA_PATH_VALIDATION = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_test_2024.csv" # <--- UPDATED PATH

TARGET_COLUMNS = ['avg_min_price', 'avg_max_price', 'avg_modal_price']
DATE_COLUMN = 'date' # Ensure this matches the column name in your CSVs
MIN_DATA_POINTS_TRAIN = 30 # Minimum data points required to train a model

# --- User Selections (Strings) ---
SELECTED_STATE_STR = "Maharashtra"  # Assuming Nashik is in Maharashtra
SELECTED_DISTRICT_STR = "Nashik"     # <--- UPDATED DISTRICT
SELECTED_COMMODITY_STR = "Wheat"    # Assuming Wheat is still the target commodity

# --- Outlier Removal Function (No changes needed) ---
def remove_outliers_iqr(df, columns_to_check):
    """Removes outliers from specified numerical columns using the IQR method."""
    df_filtered = df.copy()
    initial_rows = len(df_filtered)
    print(f"Applying IQR Outlier Removal on columns: {columns_to_check}")
    valid_columns = [col for col in columns_to_check if col in df_filtered.columns and pd.api.types.is_numeric_dtype(df_filtered[col])]
    if not valid_columns:
        print("Warning: No valid numeric columns found for IQR outlier removal.")
        return df_filtered
    print(f"Valid numeric columns for IQR: {valid_columns}")
    subset_for_iqr = df_filtered[valid_columns]
    Q1 = subset_for_iqr.quantile(0.25)
    Q3 = subset_for_iqr.quantile(0.75)
    IQR = Q3 - Q1
    mask = ~((subset_for_iqr < (Q1 - 1.5 * IQR)) | (subset_for_iqr > (Q3 + 1.5 * IQR))).any(axis=1)
    df_filtered = df_filtered[mask]
    rows_removed = initial_rows - len(df_filtered)
    print(f"Removed {rows_removed} rows identified as outliers based on IQR.")
    return df_filtered

# --- Data Loading and Preprocessing Function (Keeping checks for robustness) ---
def load_and_preprocess_data(path, date_col, target_cols, dataset_name="Training"):
    """Loads raw data and applies user-defined preprocessing steps."""
    print("-" * 30)
    print(f"Processing {dataset_name} Dataset")
    print("-" * 30)
    try:
        print(f"Loading {dataset_name} data from {path}...")
        df = pd.read_csv(path)
        print(f"Successfully loaded {len(df)} rows.")

        # 1. Handle Date Column
        if date_col not in df.columns:
            print(f"Error: Date column '{date_col}' not found in {dataset_name} CSV.")
            return None
        # Assuming date might be in different formats, try multiple common formats
        try:
            df[date_col] = pd.to_datetime(df[date_col], errors='coerce') # Default format first
        except ValueError:
             print(f"Could not parse {date_col} with default format, trying DD/MM/YYYY...")
             try:
                  df[date_col] = pd.to_datetime(df[date_col], format='%d/%m/%Y', errors='coerce')
             except ValueError:
                  print(f"Could not parse {date_col} with DD/MM/YYYY, trying YYYY-MM-DD...")
                  try:
                      df[date_col] = pd.to_datetime(df[date_col], format='%Y-%m-%d', errors='coerce')
                  except Exception as e:
                       print(f"Error parsing {date_col} with multiple formats: {e}. Check date column format.")
                       return None # Give up if multiple formats fail

        df.dropna(subset=[date_col], inplace=True) # Drop if coercion failed
        print(f"{len(df)} rows after date parsing and dropping invalid dates.")

        # 2. Initial dropna() - keeping this step as a safety measure
        initial_rows = len(df)
        df.dropna(inplace=True)
        print(f"{len(df)} rows after initial broad dropna(). {initial_rows - len(df)} rows removed.")

        # 3. Ensure Price columns are numeric
        for col in target_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            else:
                print(f"Warning: Target column '{col}' not found in {dataset_name} data.")
        df.dropna(subset=target_cols, inplace=True, how='any') # Drop if coercion failed
        print(f"{len(df)} rows after ensuring target columns are numeric.")

        # 4. Drop specified unused columns (adjust if your 'processed' files don't have them)
        cols_to_drop = ['calculationType', 'district_id', 'change']
        # Also drop encoded columns if they exist from previous processing
        cols_to_drop.extend(['district_name_enc', 'commodity_name_enc', 'state_name_enc'])
        existing_cols_to_drop = [col for col in cols_to_drop if col in df.columns]
        if existing_cols_to_drop:
            df.drop(columns=existing_cols_to_drop, axis=1, inplace=True)
            print(f"Dropped potentially existing unused/encoded columns: {existing_cols_to_drop}")

        # 5. Apply IQR Outlier Removal - keeping as safety measure
        df = remove_outliers_iqr(df, target_cols)

        # Final check for required columns (original string columns needed for filtering)
        required_cols = [date_col] + target_cols + ['state_name', 'district_name', 'commodity_name']
        missing_req_cols = [col for col in required_cols if col not in df.columns]
        if missing_req_cols:
            print(f"Error: Required columns missing in {dataset_name} data after preprocessing: {missing_req_cols}")
            print(f"Available columns: {df.columns.tolist()}")
            return None

        df.sort_values(date_col, inplace=True)
        print(f"{dataset_name} data preprocessing complete. {len(df)} rows remaining.")
        return df

    except FileNotFoundError:
        print(f"Error: {dataset_name} data file not found at {path}")
        return None
    except Exception as e:
        print(f"Error loading or preprocessing {dataset_name} data: {e}")
        import traceback
        traceback.print_exc() # Print full traceback for complex errors
        return None

# --- Training Function (No changes needed) ---
def train_prophet_model(training_data, target_column):
    """Trains a Prophet model on historical data and returns the fitted model."""
    print(f"\nTraining Prophet model for '{target_column}' using historical data...")
    prophet_df = training_data[[DATE_COLUMN, target_column]].rename(columns={DATE_COLUMN: 'ds', target_column: 'y'})
    if len(prophet_df) < MIN_DATA_POINTS_TRAIN:
        print(f"Warning: Not enough historical data points ({len(prophet_df)}) for '{target_column}' to train. Need at least {MIN_DATA_POINTS_TRAIN}. Skipping.")
        return None
    try:
        model = Prophet(
            yearly_seasonality=True,
            weekly_seasonality=False, # Adjust seasonality as needed based on data patterns
            daily_seasonality=False
        )
        model.fit(prophet_df)
        print("Model training complete.")
        return model
    except Exception as e:
        print(f"Error during Prophet model training for {target_column}: {e}")
        return None

# --- Evaluation Metrics Function (No changes needed) ---
def calculate_metrics(y_true, y_pred):
    """Calculates and returns R2, MAE, and MSE."""
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    if len(y_true) == 0 or len(y_pred) == 0:
        print("Warning: Empty arrays passed to calculate_metrics.")
        return np.nan, np.nan, np.nan
    if len(y_true) != len(y_pred):
        print(f"Warning: Length mismatch in calculate_metrics. True: {len(y_true)}, Pred: {len(y_pred)}")
        min_len = min(len(y_true), len(y_pred))
        if min_len == 0 : return np.nan, np.nan, np.nan
        y_true = y_true[:min_len]
        y_pred = y_pred[:min_len]
    try:
        r2 = r2_score(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)
        return r2, mae, mse
    except Exception as e:
        print(f"Error calculating metrics: {e}")
        return np.nan, np.nan, np.nan

# --- Plotting Function for Validation (Updated Title) ---
def plot_validation_results(validation_actuals, validation_predictions, target_column, title):
    """Creates a Plotly figure comparing actual validation data vs predictions."""
    fig = go.Figure()
    target_label = target_column.replace("avg_", "").replace("_price", "").capitalize()

    # Add Actual Validation Data trace
    actual_data_plot = validation_actuals[[DATE_COLUMN, target_column]].dropna()
    fig.add_trace(go.Scatter(
        x=actual_data_plot[DATE_COLUMN],
        y=actual_data_plot[target_column],
        mode='lines+markers',
        name=f'Actual {target_label} (2024)', # <--- UPDATED YEAR
        line=dict(color='blue'),
        marker=dict(size=4)
    ))

    # Add Predicted Validation Data trace (uses 'ds' from prediction results)
    fig.add_trace(go.Scatter(
        x=validation_predictions['ds'],
        y=validation_predictions['yhat'],
        mode='lines',
        name=f'Predicted {target_label} (2024)', # <--- UPDATED YEAR
        line=dict(color='red')
    ))

    # Add uncertainty interval for the predictions
    fig.add_trace(go.Scatter(
        x=validation_predictions['ds'],
        y=validation_predictions['yhat_upper'],
        mode='lines', name='Prediction Upper Bound',
        line=dict(width=0),
        showlegend=False
    ))
    fig.add_trace(go.Scatter(
        x=validation_predictions['ds'],
        y=validation_predictions['yhat_lower'],
        mode='lines', name='Prediction Lower Bound',
        line=dict(width=0),
        fillcolor='rgba(255, 0, 0, 0.2)', # Light red fill for uncertainty
        fill='tonexty',
        showlegend=False
    ))

    fig.update_layout(
        title=title,
        xaxis_title='Date (2024)', # <--- UPDATED YEAR
        yaxis_title=f'Price ({target_label})',
        hovermode="x unified",
        legend_title_text='Legend'
    )
    return fig


# --- Main Execution Block ---
print("--- Crop Price Time Series Forecasting & Validation (Nashik: 2002-2023 Train, 2024 Validate) ---")

# Load Training data (2002-2023)
df_train_full = load_and_preprocess_data(DATA_PATH_TRAIN, DATE_COLUMN, TARGET_COLUMNS, "Training (2002-2023)")

# Load Validation data (2024)
df_val_full = load_and_preprocess_data(DATA_PATH_VALIDATION, DATE_COLUMN, TARGET_COLUMNS, "Validation (2024)")

# Proceed only if both datasets loaded successfully
if df_train_full is not None and df_val_full is not None:

    # --- Filtering Data Based on User Selections ---
    print(f"\nFiltering datasets for State='{SELECTED_STATE_STR}', District='{SELECTED_DISTRICT_STR}', Commodity='{SELECTED_COMMODITY_STR}'...")

    # Filter Training Data
    filter_cols = ['state_name', 'district_name', 'commodity_name']
    if not all(col in df_train_full.columns for col in filter_cols):
         print("Error: Filter columns missing in Training data.")
         filtered_df_train = pd.DataFrame() # Empty df
    else:
        filtered_df_train = df_train_full[
            (df_train_full['state_name'].str.strip().str.lower() == SELECTED_STATE_STR.strip().lower()) &
            (df_train_full['district_name'].str.strip().str.lower() == SELECTED_DISTRICT_STR.strip().lower()) &
            (df_train_full['commodity_name'].str.strip().str.lower() == SELECTED_COMMODITY_STR.strip().lower())
        ].copy()
        filtered_df_train.sort_values(by=DATE_COLUMN, inplace=True)

    # Filter Validation Data
    if not all(col in df_val_full.columns for col in filter_cols):
         print("Error: Filter columns missing in Validation data.")
         filtered_df_val = pd.DataFrame() # Empty df
    else:
        filtered_df_val = df_val_full[
            (df_val_full['state_name'].str.strip().str.lower() == SELECTED_STATE_STR.strip().lower()) &
            (df_val_full['district_name'].str.strip().str.lower() == SELECTED_DISTRICT_STR.strip().lower()) &
            (df_val_full['commodity_name'].str.strip().str.lower() == SELECTED_COMMODITY_STR.strip().lower())
        ].copy()
        filtered_df_val.sort_values(by=DATE_COLUMN, inplace=True)

    # Check if filtering resulted in data for BOTH sets
    if filtered_df_train.empty:
        print("\nWarning: No training data found for the selected combination after filtering.")
    if filtered_df_val.empty:
        print("\nWarning: No validation data found for the selected combination after filtering.")

    # Proceed only if BOTH filtered datasets have data
    if not filtered_df_train.empty and not filtered_df_val.empty:
        print(f"\nFound {len(filtered_df_train)} training points (Nashik/Wheat, 2002-2023) and {len(filtered_df_val)} validation points (Nashik/Wheat, 2024).")

        # Loop through each target price type
        for target in TARGET_COLUMNS:
            print("=" * 60)
            print(f"Processing Target: {target}")
            print("=" * 60)

            # --- Prepare Data for Current Target ---
            target_df_train = filtered_df_train[[DATE_COLUMN, target]].dropna().copy()
            target_df_val = filtered_df_val[[DATE_COLUMN, target]].dropna().copy()

            if target_df_train.empty:
                print(f"Warning: No valid training data points for '{target}'. Skipping.")
                continue
            if target_df_val.empty:
                print(f"Warning: No valid validation data points for '{target}'. Skipping.")
                continue

            # --- Train Model ---
            model = train_prophet_model(target_df_train, target)

            if model is not None:
                # --- Predict on Validation Period (2024) ---
                print(f"\nPredicting on validation period (2024) for '{target}'...")
                future_df_val = target_df_val[[DATE_COLUMN]].rename(columns={DATE_COLUMN: 'ds'})

                try:
                    validation_forecast = model.predict(future_df_val)
                    print("Prediction on validation dates complete.")

                    # --- Evaluate on Validation Set (2024) ---
                    print(f"\n--- Evaluating Model Performance on 2024 Validation Data for {target} ---")

                    validation_results = pd.merge(
                        target_df_val[[DATE_COLUMN, target]],
                        validation_forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']],
                        left_on=DATE_COLUMN,
                        right_on='ds',
                        how='inner'
                    )

                    if not validation_results.empty:
                        actuals_val = validation_results[target]
                        preds_val = validation_results['yhat']

                        r2_val, mae_val, mse_val = calculate_metrics(actuals_val, preds_val)
                        print(f"Validation R-squared (R2): {r2_val:.4f}")
                        print(f"Validation Mean Absolute Error (MAE): {mae_val:.2f}")
                        print(f"Validation Mean Squared Error (MSE): {mse_val:.2f}")

                        # --- Plot Validation Results ---
                        print(f"\n--- Plotting Validation Results for {target} (Actual vs. Predicted 2024) ---")
                        plot_title_val = f'Validation (Nashik/Wheat): {target.replace("avg_", "").replace("_price", "").capitalize()} Price (Actual vs. Predicted 2024)' # <--- UPDATED TITLE
                        fig_val = plot_validation_results(target_df_val, validation_results, target, plot_title_val)
                        fig_val.show()

                    else:
                        print("Warning: Could not align actual validation data with predictions. Skipping evaluation and plotting.")

                except Exception as e:
                    print(f"Error during prediction or evaluation on validation data: {e}")
                    import traceback
                    traceback.print_exc()
            else:
                print(f"Skipping validation for {target} because model training failed.")

        print("-" * 60)
        print("\nForecasting and Validation process finished.")

    else:
         print("\nCannot proceed with forecasting/validation due to lack of data in filtered training or validation sets.")

else:
    print("\nFailed to load or preprocess one or both datasets. Cannot run the process.")

--- Crop Price Time Series Forecasting & Validation (Nashik: 2002-2023 Train, 2024 Validate) ---
------------------------------
Processing Training (2002-2023) Dataset
------------------------------
Loading Training (2002-2023) data from E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_2002_2023.csv...
Successfully loaded 6246 rows.
6246 rows after date parsing and dropping invalid dates.
6246 rows after initial broad dropna(). 0 rows removed.
6246 rows after ensuring target columns are numeric.
Applying IQR Outlier Removal on columns: ['avg_min_price', 'avg_max_price', 'avg_modal_price']
Valid numeric columns for IQR: ['avg_min_price', 'avg_max_price', 'avg_modal_price']
Removed 10 rows identified as outliers based on IQR.
Training (2002-2023) data preprocessing complete. 6236 rows remaining.
------------------------------
Processing Validation (2024) Dataset
------------------------------
Loading Validation (2024) data from E:\elevatetrsest\crop price pr

AttributeError: Can only use .str accessor with string values!

## hyperparameter tuning (for non edited dataset)

In [7]:


import pandas as pd
from prophet import Prophet
import plotly.graph_objects as go
import datetime
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from IPython.display import display
import numpy as np
import optuna # <-- Import Optuna

# --- Configuration ---
# Training Data (Historical: 2002-2023, Nashik)
DATA_PATH_TRAIN = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_2002_2023.csv"
# Validation Data (Test: 2024, Nashik)
DATA_PATH_VALIDATION = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_test_2024.csv"

TARGET_COLUMNS = ['avg_min_price', 'avg_max_price', 'avg_modal_price']
DATE_COLUMN = 'date'
MIN_DATA_POINTS_TRAIN = 30

# --- User Selections (Strings) ---
SELECTED_STATE_STR = "Maharashtra"
SELECTED_DISTRICT_STR = "Nashik"
SELECTED_COMMODITY_STR = "Wheat"

# --- Optuna Configuration ---
N_TRIALS = 25 # Number of hyperparameter combinations to test (adjust based on time/resources)
# Note: Increasing N_TRIALS generally improves chances of finding better parameters but takes longer.

# --- Outlier Removal Function (No changes) ---
def remove_outliers_iqr(df, columns_to_check):
    df_filtered = df.copy()
    initial_rows = len(df_filtered)
    print(f"Applying IQR Outlier Removal on columns: {columns_to_check}")
    valid_columns = [col for col in columns_to_check if col in df_filtered.columns and pd.api.types.is_numeric_dtype(df_filtered[col])]
    if not valid_columns:
        print("Warning: No valid numeric columns found for IQR outlier removal.")
        return df_filtered
    # print(f"Valid numeric columns for IQR: {valid_columns}") # Less verbose
    subset_for_iqr = df_filtered[valid_columns]
    Q1 = subset_for_iqr.quantile(0.25)
    Q3 = subset_for_iqr.quantile(0.75)
    IQR = Q3 - Q1
    mask = ~((subset_for_iqr < (Q1 - 1.5 * IQR)) | (subset_for_iqr > (Q3 + 1.5 * IQR))).any(axis=1)
    df_filtered = df_filtered[mask]
    rows_removed = initial_rows - len(df_filtered)
    if rows_removed > 0:
        print(f"Removed {rows_removed} rows identified as outliers based on IQR.")
    return df_filtered

# --- Data Loading and Preprocessing Function (No changes) ---
def load_and_preprocess_data(path, date_col, target_cols, dataset_name="Training"):
    print("-" * 30)
    print(f"Processing {dataset_name} Dataset")
    print("-" * 30)
    try:
        print(f"Loading {dataset_name} data from {path}...")
        df = pd.read_csv(path)
        print(f"Successfully loaded {len(df)} rows.")
        # ... (rest of the function is the same as previous version) ...
        # 1. Handle Date Column
        if date_col not in df.columns:
            print(f"Error: Date column '{date_col}' not found in {dataset_name} CSV.")
            return None
        try:
            df[date_col] = pd.to_datetime(df[date_col], errors='coerce') # Default format first
        except ValueError:
             print(f"Could not parse {date_col} with default format, trying DD/MM/YYYY...")
             try:
                  df[date_col] = pd.to_datetime(df[date_col], format='%d/%m/%Y', errors='coerce')
             except ValueError:
                  print(f"Could not parse {date_col} with DD/MM/YYYY, trying YYYY-MM-DD...")
                  try:
                      df[date_col] = pd.to_datetime(df[date_col], format='%Y-%m-%d', errors='coerce')
                  except Exception as e:
                       print(f"Error parsing {date_col} with multiple formats: {e}. Check date column format.")
                       return None

        df.dropna(subset=[date_col], inplace=True)
        print(f"{len(df)} rows after date parsing and dropping invalid dates.")

        # 2. Initial dropna()
        initial_rows = len(df)
        df.dropna(inplace=True)
        if initial_rows > len(df):
            print(f"{len(df)} rows after initial broad dropna(). {initial_rows - len(df)} rows removed.")

        # 3. Ensure Price columns are numeric
        for col in target_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            else:
                print(f"Warning: Target column '{col}' not found in {dataset_name} data.")
        df.dropna(subset=target_cols, inplace=True, how='any')
        print(f"{len(df)} rows after ensuring target columns are numeric.")

        # 4. Drop specified unused columns
        cols_to_drop = ['calculationType', 'district_id', 'change']
        cols_to_drop.extend(['district_name_enc', 'commodity_name_enc', 'state_name_enc']) # Drop potential old encoded cols
        existing_cols_to_drop = [col for col in cols_to_drop if col in df.columns]
        if existing_cols_to_drop:
            df.drop(columns=existing_cols_to_drop, axis=1, inplace=True)
            print(f"Dropped potentially existing unused/encoded columns: {existing_cols_to_drop}")

        # 5. Apply IQR Outlier Removal
        df = remove_outliers_iqr(df, target_cols)

        # Final check for required columns
        required_cols = [date_col] + target_cols + ['state_name', 'district_name', 'commodity_name']
        missing_req_cols = [col for col in required_cols if col not in df.columns]
        if missing_req_cols:
            print(f"Error: Required columns missing in {dataset_name} data after preprocessing: {missing_req_cols}")
            print(f"Available columns: {df.columns.tolist()}")
            return None

        df.sort_values(date_col, inplace=True)
        print(f"{dataset_name} data preprocessing complete. {len(df)} rows remaining.")
        return df

    except FileNotFoundError:
        print(f"Error: {dataset_name} data file not found at {path}")
        return None
    except Exception as e:
        print(f"Error loading or preprocessing {dataset_name} data: {e}")
        import traceback
        traceback.print_exc()
        return None


# --- Optuna Objective Function ---
def objective(trial, train_df, val_df, target_col, date_col):
    """Objective function for Optuna to minimize (e.g., validation MAE)."""
    # 1. Suggest Hyperparameters
    changepoint_prior_scale = trial.suggest_float('changepoint_prior_scale', 0.001, 0.5, log=True)
    seasonality_prior_scale = trial.suggest_float('seasonality_prior_scale', 0.01, 10.0, log=True)
    seasonality_mode = trial.suggest_categorical('seasonality_mode', ['additive', 'multiplicative'])
    # holidays_prior_scale = trial.suggest_float('holidays_prior_scale', 0.01, 10.0, log=True) # If using holidays

    params = {
        'changepoint_prior_scale': changepoint_prior_scale,
        'seasonality_prior_scale': seasonality_prior_scale,
        'seasonality_mode': seasonality_mode,
        'yearly_seasonality': True, # Keep other settings consistent
        'weekly_seasonality': False,
        'daily_seasonality': False
        # 'holidays_prior_scale': holidays_prior_scale # If using holidays
    }

    try:
        # 2. Train Model with Suggested Parameters
        prophet_df_train = train_df[[date_col, target_col]].rename(columns={date_col: 'ds', target_col: 'y'})

        # Check for sufficient data points within the objective trial's training split
        if len(prophet_df_train) < MIN_DATA_POINTS_TRAIN:
             # print(f"Trial {trial.number}: Skipping due to insufficient training data ({len(prophet_df_train)} points).")
             # Return a high error value if a trial cannot run
             return float('inf')

        model = Prophet(**params)
        model.fit(prophet_df_train)

        # 3. Predict on Validation Dates
        future_df_val = val_df[[date_col]].rename(columns={date_col: 'ds'})
        forecast = model.predict(future_df_val)

        # 4. Evaluate (Calculate MAE on Validation Set)
        # Merge actuals and predictions
        results = pd.merge(
            val_df[[date_col, target_col]],
            forecast[['ds', 'yhat']],
            left_on=date_col,
            right_on='ds',
            how='inner'
        )

        if results.empty:
             # print(f"Trial {trial.number}: No matching dates between validation actuals and forecast.")
             return float('inf') # Return high error if alignment fails

        mae = mean_absolute_error(results[target_col], results['yhat'])
        # print(f"Trial {trial.number}: MAE = {mae:.2f} with params {params}") # Verbose output for debugging
        return mae # Optuna minimizes this value

    except Exception as e:
        # Handle potential errors during model fitting/prediction (e.g., invalid params)
        print(f"Trial {trial.number} failed: {e}")
        return float('inf') # Return a large value to penalize failed trials


# --- MODIFIED Training Function (Accepts Params) ---
def train_prophet_model(training_data, target_column, params):
    """Trains a Prophet model on historical data using provided hyperparameters."""
    print(f"\nTraining FINAL Prophet model for '{target_column}' using best parameters...")
    print(f"Best Params: {params}")

    prophet_df = training_data[[DATE_COLUMN, target_column]].rename(columns={DATE_COLUMN: 'ds', target_column: 'y'})

    if len(prophet_df) < MIN_DATA_POINTS_TRAIN:
        print(f"Warning: Not enough historical data points ({len(prophet_df)}) for '{target_column}' to train final model. Skipping.")
        return None
    try:
        # Instantiate model with the best parameters found by Optuna
        model = Prophet(**params)
        model.fit(prophet_df)
        print("Final model training complete.")
        return model
    except Exception as e:
        print(f"Error during FINAL Prophet model training for {target_column}: {e}")
        return None

# --- Evaluation Metrics Function (No changes) ---
def calculate_metrics(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    if len(y_true) == 0 or len(y_pred) == 0:
        print("Warning: Empty arrays passed to calculate_metrics.")
        return np.nan, np.nan, np.nan
    if len(y_true) != len(y_pred):
        print(f"Warning: Length mismatch in calculate_metrics. True: {len(y_true)}, Pred: {len(y_pred)}")
        min_len = min(len(y_true), len(y_pred))
        if min_len == 0 : return np.nan, np.nan, np.nan
        y_true = y_true[:min_len]
        y_pred = y_pred[:min_len]
    try:
        r2 = r2_score(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)
        return r2, mae, mse
    except Exception as e:
        print(f"Error calculating metrics: {e}")
        return np.nan, np.nan, np.nan


# --- Plotting Function for Validation (No changes) ---
def plot_validation_results(validation_actuals, validation_predictions, target_column, title):
    fig = go.Figure()
    target_label = target_column.replace("avg_", "").replace("_price", "").capitalize()
    # ... (rest of the function is the same as previous version) ...
    # Add Actual Validation Data trace
    actual_data_plot = validation_actuals[[DATE_COLUMN, target_column]].dropna()
    fig.add_trace(go.Scatter(
        x=actual_data_plot[DATE_COLUMN],
        y=actual_data_plot[target_column],
        mode='lines+markers',
        name=f'Actual {target_label} (2024)',
        line=dict(color='blue'),
        marker=dict(size=4)
    ))

    # Add Predicted Validation Data trace
    fig.add_trace(go.Scatter(
        x=validation_predictions['ds'],
        y=validation_predictions['yhat'],
        mode='lines',
        name=f'Predicted {target_label} (2024)',
        line=dict(color='red')
    ))

    # Add uncertainty interval
    fig.add_trace(go.Scatter(
        x=validation_predictions['ds'],
        y=validation_predictions['yhat_upper'],
        mode='lines', name='Prediction Upper Bound',
        line=dict(width=0),
        showlegend=False
    ))
    fig.add_trace(go.Scatter(
        x=validation_predictions['ds'],
        y=validation_predictions['yhat_lower'],
        mode='lines', name='Prediction Lower Bound',
        line=dict(width=0),
        fillcolor='rgba(255, 0, 0, 0.2)',
        fill='tonexty',
        showlegend=False
    ))

    fig.update_layout(
        title=title,
        xaxis_title='Date (2024)',
        yaxis_title=f'Price ({target_label})',
        hovermode="x unified",
        legend_title_text='Legend'
    )
    return fig


# --- Main Execution Block ---
print("--- Crop Price Time Series Forecasting & Validation with Optuna Tuning ---")
print(f"--- (Nashik: 2002-2023 Train, 2024 Validate) ---")

# Load Training data (2002-2023)
df_train_full = load_and_preprocess_data(DATA_PATH_TRAIN, DATE_COLUMN, TARGET_COLUMNS, "Training (2002-2023)")

# Load Validation data (2024)
df_val_full = load_and_preprocess_data(DATA_PATH_VALIDATION, DATE_COLUMN, TARGET_COLUMNS, "Validation (2024)")

# Proceed only if both datasets loaded successfully
if df_train_full is not None and df_val_full is not None:

    # --- Filtering Data Based on User Selections ---
    print(f"\nFiltering datasets for State='{SELECTED_STATE_STR}', District='{SELECTED_DISTRICT_STR}', Commodity='{SELECTED_COMMODITY_STR}'...")
    filter_cols = ['state_name', 'district_name', 'commodity_name']

    # Filter Training Data
    if not all(col in df_train_full.columns for col in filter_cols):
         print("Error: Filter columns missing in Training data.")
         filtered_df_train = pd.DataFrame()
    else:
        filtered_df_train = df_train_full[
            (df_train_full['state_name'].str.strip().str.lower() == SELECTED_STATE_STR.strip().lower()) &
            (df_train_full['district_name'].str.strip().str.lower() == SELECTED_DISTRICT_STR.strip().lower()) &
            (df_train_full['commodity_name'].str.strip().str.lower() == SELECTED_COMMODITY_STR.strip().lower())
        ].copy()
        filtered_df_train.sort_values(by=DATE_COLUMN, inplace=True)

    # Filter Validation Data
    if not all(col in df_val_full.columns for col in filter_cols):
         print("Error: Filter columns missing in Validation data.")
         filtered_df_val = pd.DataFrame()
    else:
        filtered_df_val = df_val_full[
            (df_val_full['state_name'].str.strip().str.lower() == SELECTED_STATE_STR.strip().lower()) &
            (df_val_full['district_name'].str.strip().str.lower() == SELECTED_DISTRICT_STR.strip().lower()) &
            (df_val_full['commodity_name'].str.strip().str.lower() == SELECTED_COMMODITY_STR.strip().lower())
        ].copy()
        filtered_df_val.sort_values(by=DATE_COLUMN, inplace=True)

    # Check if filtering resulted in data for BOTH sets
    if filtered_df_train.empty:
        print("\nWarning: No training data found for the selected combination after filtering.")
    if filtered_df_val.empty:
        print("\nWarning: No validation data found for the selected combination after filtering.")

    # Proceed only if BOTH filtered datasets have data
    if not filtered_df_train.empty and not filtered_df_val.empty:
        print(f"\nFound {len(filtered_df_train)} training points and {len(filtered_df_val)} validation points for the selection.")

        # --- Loop through each target price type ---
        for target in TARGET_COLUMNS:
            print("=" * 60)
            print(f"Processing Target: {target}")
            print("=" * 60)

            # Prepare Data for Current Target
            target_df_train = filtered_df_train[[DATE_COLUMN, target]].dropna().copy()
            target_df_val = filtered_df_val[[DATE_COLUMN, target]].dropna().copy()

            if target_df_train.empty or target_df_val.empty:
                print(f"Warning: Insufficient valid data points for train/validation for '{target}'. Skipping.")
                continue

            # --- Hyperparameter Tuning with Optuna ---
            print(f"\nStarting Optuna hyperparameter tuning for '{target}'...")
            # Use a lambda to pass additional arguments to the objective function
            study = optuna.create_study(direction='minimize') # We want to minimize MAE/MSE
            try:
                 # Set timeout for study if needed: timeout=600 (in seconds)
                study.optimize(lambda trial: objective(trial, target_df_train, target_df_val, target, DATE_COLUMN),
                               n_trials=N_TRIALS)

                best_params = study.best_params
                print(f"Optuna finished. Best MAE found during tuning: {study.best_value:.4f}")
                print(f"Best hyperparameters found: {best_params}")

                # Ensure default seasonalities are added if not tuned ( Prophet requires them )
                final_params = {
                    'yearly_seasonality': True,
                    'weekly_seasonality': False,
                    'daily_seasonality': False,
                     **best_params # Add the tuned parameters, overriding defaults if necessary
                }


            except Exception as e:
                print(f"Optuna study failed for {target}: {e}")
                print("Proceeding with default Prophet parameters.")
                # Define default parameters if tuning fails
                final_params = {
                    'changepoint_prior_scale': 0.05, # Default value
                    'seasonality_prior_scale': 10.0, # Default value
                    'seasonality_mode': 'additive', # Default value
                    'yearly_seasonality': True,
                    'weekly_seasonality': False,
                    'daily_seasonality': False
                }


            # --- Train FINAL Model with Best (or Default) Params ---
            # Pass the found best parameters (or defaults if tuning failed)
            model = train_prophet_model(target_df_train, target, final_params)

            if model is not None:
                # --- Predict on Validation Period (2024) using FINAL model ---
                print(f"\nPredicting on validation period (2024) for '{target}' using the final tuned model...")
                future_df_val = target_df_val[[DATE_COLUMN]].rename(columns={DATE_COLUMN: 'ds'})
                try:
                    validation_forecast = model.predict(future_df_val)
                    print("Prediction on validation dates complete.")

                    # --- Evaluate FINAL Model on Validation Set (2024) ---
                    print(f"\n--- Evaluating FINAL Tuned Model Performance on 2024 Validation Data for {target} ---")
                    validation_results = pd.merge(
                        target_df_val[[DATE_COLUMN, target]],
                        validation_forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']],
                        left_on=DATE_COLUMN, right_on='ds', how='inner'
                    )

                    if not validation_results.empty:
                        actuals_val = validation_results[target]
                        preds_val = validation_results['yhat']
                        r2_val, mae_val, mse_val = calculate_metrics(actuals_val, preds_val)
                        print(f"FINAL Validation R-squared (R2): {r2_val:.4f}")
                        print(f"FINAL Validation Mean Absolute Error (MAE): {mae_val:.2f}")
                        print(f"FINAL Validation Mean Squared Error (MSE): {mse_val:.2f}")

                        # --- Plot FINAL Validation Results ---
                        print(f"\n--- Plotting FINAL Validation Results for {target} (Actual vs. Predicted 2024) ---")
                        plot_title_val = f'TUNED Validation (Nashik/Wheat): {target.replace("avg_", "").replace("_price", "").capitalize()} Price (Actual vs. Predicted 2024)'
                        fig_val = plot_validation_results(target_df_val, validation_results, target, plot_title_val)
                        fig_val.show()
                    else:
                        print("Warning: Could not align actual validation data with FINAL predictions. Skipping evaluation and plotting.")

                except Exception as e:
                    print(f"Error during prediction or evaluation on validation data using FINAL model: {e}")
                    import traceback
                    traceback.print_exc()
            else:
                print(f"Skipping validation for {target} because final model training failed.")

        print("-" * 60)
        print("\nForecasting, Tuning, and Validation process finished.")

    else:
         print("\nCannot proceed due to lack of data in filtered training or validation sets.")
else:
    print("\nFailed to load or preprocess one or both datasets. Cannot run the process.")

--- Crop Price Time Series Forecasting & Validation with Optuna Tuning ---
--- (Nashik: 2002-2023 Train, 2024 Validate) ---
------------------------------
Processing Training (2002-2023) Dataset
------------------------------
Loading Training (2002-2023) data from E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_2002_2023.csv...
Successfully loaded 6246 rows.
6246 rows after date parsing and dropping invalid dates.
6246 rows after ensuring target columns are numeric.
Applying IQR Outlier Removal on columns: ['avg_min_price', 'avg_max_price', 'avg_modal_price']
Removed 10 rows identified as outliers based on IQR.
Training (2002-2023) data preprocessing complete. 6236 rows remaining.
------------------------------
Processing Validation (2024) Dataset
------------------------------
Loading Validation (2024) data from E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_test_2024.csv...
Successfully loaded 6246 rows.
6246 rows after d

AttributeError: Can only use .str accessor with string values!

## hyperparameter tuning for edited datasets 

Prophet Hyperparameters: Identify the key hyperparameters in Prophet that significantly affect performance and are suitable for tuning:

<b>changepoint_prior_scale:</b> Controls the flexibility of the trend changes. Higher values allow more flexibility (risk of overfitting), lower values make it more rigid (risk of underfitting). <br>

<b>Range</b>: Usually (0.001, 0.5), sometimes up to 1.0.<br>

<b>seasonality_prior_scale</b>: Controls the flexibility of the seasonality component. Higher values allow larger seasonal fluctuations. Range: Usually (0.01, 10.0). <br>

<b>holidays_prior_scale (if holidays are used)</b>: Controls flexibility for holiday effects.<br>
 Range: Usually (0.01, 10.0). (Note: Holidays aren't currently used in the script, so this isn't applicable yet, but good to keep in mind).<br>

<b>seasonality_mode:</b> Can be 'additive' or 'multiplicative'. This is a categorical choice, not continuous. Multiplicative is often better when the seasonality's magnitude grows with the trend.
changepoint_range: Proportion of history where changepoints are allowed. Default is 0.8 (80%). Range: (0.8, 0.95).


In [9]:
# !pip install optun

import pandas as pd
from prophet import Prophet
import plotly.graph_objects as go
import datetime
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from IPython.display import display
import numpy as np
import optuna
import traceback

# --- Configuration ---
# Training Data (Historical: 2002-2023, Nashik - PRE-ENCODED)
DATA_PATH_TRAIN = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_2002_2023.csv"
# Validation Data (Test: 2024, Nashik - PRE-ENCODED)
DATA_PATH_VALIDATION = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_test_2024.csv"

TARGET_COLUMNS = ['avg_min_price', 'avg_max_price', 'avg_modal_price']
# DATE_COLUMN will be *created* from year, month, day_col
DATE_COLUMN = 'full_date' # Define the name for the combined date column
YEAR_COL = 'year'         # Column name for year
MONTH_COL = 'month'       # Column name for month
DAY_COL = 'date'          # Column name for day (as specified by user)

MIN_DATA_POINTS_TRAIN = 30

# --- User Selections (Strings) ---
SELECTED_STATE_STR = "Maharashtra"
SELECTED_DISTRICT_STR = "Nashik"
SELECTED_COMMODITY_STR = "Wheat"

# --- Frequency Encoding Mappings (CRITICAL: REPLACE WITH YOUR ACTUAL MAPPINGS) ---
state_name_encoding_map = {"maharashtra": 6291} # Replace with actual
district_name_encoding_map = {"nashik": 6291}      # Replace with actual
commodity_name_encoding_map = {"wheat": 6291}       # Replace with actual

# --- Optuna Configuration ---
N_TRIALS = 25

# --- Outlier Removal Function (No changes) ---
def remove_outliers_iqr(df, columns_to_check):
    df_filtered = df.copy()
    initial_rows = len(df_filtered)
    # print(f"Applying IQR Outlier Removal on columns: {columns_to_check}")
    valid_columns = [col for col in columns_to_check if col in df_filtered.columns and pd.api.types.is_numeric_dtype(df_filtered[col])]
    if not valid_columns: return df_filtered
    subset_for_iqr = df_filtered[valid_columns]
    Q1 = subset_for_iqr.quantile(0.25); Q3 = subset_for_iqr.quantile(0.75); IQR = Q3 - Q1
    mask = ~((subset_for_iqr < (Q1 - 1.5 * IQR)) | (subset_for_iqr > (Q3 + 1.5 * IQR))).any(axis=1)
    df_filtered = df_filtered[mask]
    rows_removed = initial_rows - len(df_filtered)
    if rows_removed > 0: print(f"Removed {rows_removed} rows identified as outliers based on IQR.")
    return df_filtered

# --- Data Loading and Preprocessing Function (MODIFIED DATE HANDLING) ---
def load_and_preprocess_data(path, date_col_name, year_col, month_col, day_col, target_cols, dataset_name="Training"):
    """Loads data and constructs the date from year, month, day columns."""
    print("-" * 30); print(f"Processing {dataset_name} Dataset"); print("-" * 30)
    try:
        print(f"Loading {dataset_name} data from {path}...")
        df = pd.read_csv(path)
        print(f"Successfully loaded {len(df)} rows.")

        # --- 1. Construct Date Column from year, month, day ---
        date_components_cols = [year_col, month_col, day_col]
        if not all(col in df.columns for col in date_components_cols):
            missing_dt_cols = [col for col in date_components_cols if col not in df.columns]
            print(f"Error: Date component columns missing: {missing_dt_cols}")
            return None

        # Ensure components are numeric before assembly
        for col in date_components_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        df.dropna(subset=date_components_cols, inplace=True) # Drop rows if year/month/day invalid

        print(f"Constructing '{date_col_name}' from '{year_col}', '{month_col}', '{day_col}' columns...")
        # Assemble the date - IMPORTANT CORRECTION based on user input
        df[date_col_name] = pd.to_datetime({
            'year': df[year_col],
            'month': df[month_col],
            'day': df[day_col]  # Use the column user specified for day
        }, errors='coerce')

        # Drop rows where constructed date is invalid (NaT)
        initial_rows_date = len(df)
        df.dropna(subset=[date_col_name], inplace=True)
        if initial_rows_date > len(df):
             print(f"Dropped {initial_rows_date - len(df)} rows due to invalid date component combinations.")
        print(f"{len(df)} rows after date construction and validation.")
        # --- End Date Construction ---

        # 2. Initial dropna()
        initial_rows = len(df); df.dropna(inplace=True)
        if initial_rows > len(df): print(f"{len(df)} rows after initial broad dropna(). {initial_rows - len(df)} removed.")

        # 3. Ensure Price columns are numeric
        for col in target_cols:
            if col in df.columns: df[col] = pd.to_numeric(df[col], errors='coerce')
            else: print(f"Warning: Target column '{col}' not found.")
        df.dropna(subset=target_cols, inplace=True, how='any')
        print(f"{len(df)} rows after ensuring target columns numeric.")

        # 4. Drop OTHER unused columns
        cols_to_drop = ['calculationType', 'district_id', 'change',
                        'district_name_enc', 'commodity_name_enc', 'state_name_enc']
        existing_cols_to_drop = [col for col in cols_to_drop if col in df.columns]
        if existing_cols_to_drop:
            df.drop(columns=existing_cols_to_drop, axis=1, inplace=True)
            # print(f"Dropped potentially existing unused columns: {existing_cols_to_drop}") # Less verbose

        # 5. Apply IQR Outlier Removal
        df = remove_outliers_iqr(df, target_cols)

        # Final check for required columns
        required_numeric_filter_cols = ['state_name', 'district_name', 'commodity_name']
        required_cols = [date_col_name] + target_cols + required_numeric_filter_cols # Use constructed date_col_name
        missing_req_cols = [col for col in required_cols if col not in df.columns]
        if missing_req_cols:
            print(f"Error: Required columns missing: {missing_req_cols}"); print(f"Available: {df.columns.tolist()}"); return None
        for col in required_numeric_filter_cols:
             if not pd.api.types.is_numeric_dtype(df[col]): print(f"Error: Col '{col}' expected numeric but isn't."); return None

        df.sort_values(date_col_name, inplace=True) # Sort by constructed date
        print(f"{dataset_name} data preprocessing complete. {len(df)} rows remaining.")
        return df

    except FileNotFoundError: print(f"Error: {dataset_name} file not found at {path}"); return None
    except Exception as e: print(f"Error loading/preprocessing {dataset_name}: {e}"); traceback.print_exc(); return None

# --- Optuna Objective Function (Use DATE_COLUMN name) ---
def objective(trial, train_df, val_df, target_col, date_col_name): # Use date_col_name
    changepoint_prior_scale = trial.suggest_float('changepoint_prior_scale', 0.001, 0.5, log=True)
    seasonality_prior_scale = trial.suggest_float('seasonality_prior_scale', 0.01, 10.0, log=True)
    seasonality_mode = trial.suggest_categorical('seasonality_mode', ['additive', 'multiplicative'])
    params = {
        'changepoint_prior_scale': changepoint_prior_scale, 'seasonality_prior_scale': seasonality_prior_scale,
        'seasonality_mode': seasonality_mode, 'yearly_seasonality': True, 'weekly_seasonality': False, 'daily_seasonality': False
    }
    try:
        prophet_df_train = train_df[[date_col_name, target_col]].rename(columns={date_col_name: 'ds', target_col: 'y'})
        if len(prophet_df_train) < MIN_DATA_POINTS_TRAIN: return float('inf')
        model = Prophet(**params); model.fit(prophet_df_train)
        future_df_val = val_df[[date_col_name]].rename(columns={date_col_name: 'ds'}) # Use date_col_name
        forecast = model.predict(future_df_val)
        results = pd.merge(val_df[[date_col_name, target_col]], forecast[['ds', 'yhat']],
                           left_on=date_col_name, right_on='ds', how='inner') # Use date_col_name
        if results.empty: return float('inf')
        mae = mean_absolute_error(results[target_col], results['yhat'])
        return mae
    except Exception as e: return float('inf')

# --- Training Function (Use DATE_COLUMN name) ---
def train_prophet_model(training_data, target_column, date_col_name, params): # Use date_col_name
    print(f"\nTraining FINAL Prophet model for '{target_column}' using best parameters..."); print(f"Best Params: {params}")
    prophet_df = training_data[[date_col_name, target_column]].rename(columns={date_col_name: 'ds', target_column: 'y'})
    if len(prophet_df) < MIN_DATA_POINTS_TRAIN: print(f"Warning: Insufficient data ({len(prophet_df)} points). Skipping."); return None
    try:
        model = Prophet(**params); model.fit(prophet_df)
        print("Final model training complete."); return model
    except Exception as e: print(f"Error during FINAL training for {target_column}: {e}"); return None

# --- Evaluation Metrics Function (No changes) ---
def calculate_metrics(y_true, y_pred):
    y_true = np.array(y_true); y_pred = np.array(y_pred)
    if len(y_true) == 0 or len(y_pred) == 0: return np.nan, np.nan, np.nan
    if len(y_true) != len(y_pred):
        min_len = min(len(y_true), len(y_pred)); print(f"Warn: Mismatch metrics. Truncating to {min_len}.");
        if min_len == 0 : return np.nan, np.nan, np.nan
        y_true = y_true[:min_len]; y_pred = y_pred[:min_len]
    try:
        r2 = r2_score(y_true, y_pred); mae = mean_absolute_error(y_true, y_pred); mse = mean_squared_error(y_true, y_pred)
        return r2, mae, mse
    except Exception as e: print(f"Error calculating metrics: {e}"); return np.nan, np.nan, np.nan

# --- Plotting Function (Use DATE_COLUMN name) ---
def plot_validation_results(validation_actuals, validation_predictions, target_column, date_col_name, title): # Use date_col_name
    fig = go.Figure(); target_label = target_column.replace("avg_", "").replace("_price", "").capitalize()
    actual_data_plot = validation_actuals[[date_col_name, target_column]].dropna()
    fig.add_trace(go.Scatter(x=actual_data_plot[date_col_name], y=actual_data_plot[target_column], mode='lines+markers', name=f'Actual {target_label} (2024)', line=dict(color='blue'), marker=dict(size=4)))
    fig.add_trace(go.Scatter(x=validation_predictions['ds'], y=validation_predictions['yhat'], mode='lines', name=f'Predicted {target_label} (2024)', line=dict(color='red')))
    fig.add_trace(go.Scatter(x=validation_predictions['ds'], y=validation_predictions['yhat_upper'], mode='lines', name='Upper Bound', line=dict(width=0), showlegend=False))
    fig.add_trace(go.Scatter(x=validation_predictions['ds'], y=validation_predictions['yhat_lower'], mode='lines', name='Lower Bound', line=dict(width=0), fillcolor='rgba(255, 0, 0, 0.2)', fill='tonexty', showlegend=False))
    fig.update_layout(title=title, xaxis_title='Date (2024)', yaxis_title=f'Price ({target_label})', hovermode="x unified", legend_title_text='Legend')
    return fig


# --- Main Execution Block ---
print("--- Crop Price Time Series Forecasting & Validation with Optuna Tuning ---")
print(f"--- (Nashik: 2002-2023 Train, 2024 Validate - Using Pre-Encoded Filters & Constructed Date) ---") # Updated title

# Load Training data (pass date component column names)
df_train_full = load_and_preprocess_data(DATA_PATH_TRAIN, DATE_COLUMN, YEAR_COL, MONTH_COL, DAY_COL, TARGET_COLUMNS, "Training (2002-2023)")

# Load Validation data (pass date component column names)
df_val_full = load_and_preprocess_data(DATA_PATH_VALIDATION, DATE_COLUMN, YEAR_COL, MONTH_COL, DAY_COL, TARGET_COLUMNS, "Validation (2024)")


# Proceed only if both datasets loaded successfully
if df_train_full is not None and df_val_full is not None:

    # --- Get Encoded Values for Filtering ---
    try:
        selected_state_key=SELECTED_STATE_STR.strip().lower(); selected_district_key=SELECTED_DISTRICT_STR.strip().lower(); selected_commodity_key=SELECTED_COMMODITY_STR.strip().lower()
        encoded_state = state_name_encoding_map.get(selected_state_key); encoded_district = district_name_encoding_map.get(selected_district_key); encoded_commodity = commodity_name_encoding_map.get(selected_commodity_key)
        lookup_failed = False
        if encoded_state is None: print(f"Error: State '{SELECTED_STATE_STR}' missing in map."); lookup_failed=True
        if encoded_district is None: print(f"Error: District '{SELECTED_DISTRICT_STR}' missing in map."); lookup_failed=True
        if encoded_commodity is None: print(f"Error: Commodity '{SELECTED_COMMODITY_STR}' missing in map."); lookup_failed=True
        if lookup_failed: print("Check encoding maps."); df_train_full=df_val_full=None
        else: print(f"\nSelected: State='{SELECTED_STATE_STR}', District='{SELECTED_DISTRICT_STR}', Commodity='{SELECTED_COMMODITY_STR}' -> Encoded: St={encoded_state}, Di={encoded_district}, Co={encoded_commodity}")
    except Exception as e: print(f"Error mapping lookup: {e}"); df_train_full = df_val_full = None


# Proceed only if lookup succeeded
if df_train_full is not None and df_val_full is not None:

    # --- Filtering Data Based on ENCODED Values ---
    print(f"\nFiltering datasets using encoded values...")
    filter_cols_num = ['state_name', 'district_name', 'commodity_name']

    # Filter Training Data
    if not all(col in df_train_full.columns for col in filter_cols_num): print("Error: Encoded filter cols missing in Training data."); filtered_df_train = pd.DataFrame()
    else:
        filtered_df_train = df_train_full[(df_train_full['state_name'] == encoded_state) & (df_train_full['district_name'] == encoded_district) & (df_train_full['commodity_name'] == encoded_commodity)].copy()
        filtered_df_train.sort_values(by=DATE_COLUMN, inplace=True) # Sort by constructed date

    # Filter Validation Data
    if not all(col in df_val_full.columns for col in filter_cols_num): print("Error: Encoded filter cols missing in Validation data."); filtered_df_val = pd.DataFrame()
    else:
        filtered_df_val = df_val_full[(df_val_full['state_name'] == encoded_state) & (df_val_full['district_name'] == encoded_district) & (df_val_full['commodity_name'] == encoded_commodity)].copy()
        filtered_df_val.sort_values(by=DATE_COLUMN, inplace=True) # Sort by constructed date

    if filtered_df_train.empty: print("\nWarning: No training data found after filtering.")
    if filtered_df_val.empty: print("\nWarning: No validation data found after filtering.")

    # Proceed only if BOTH filtered datasets have data
    if not filtered_df_train.empty and not filtered_df_val.empty:
        print(f"\nFound {len(filtered_df_train)} training points and {len(filtered_df_val)} validation points after filtering.")

        # --- Loop through each target price type ---
        for target in TARGET_COLUMNS:
            print("=" * 60); print(f"Processing Target: {target}"); print("=" * 60)
            target_df_train = filtered_df_train[[DATE_COLUMN, target]].dropna().copy() # Use constructed date
            target_df_val = filtered_df_val[[DATE_COLUMN, target]].dropna().copy()   # Use constructed date

            if target_df_train.empty or target_df_val.empty: print(f"Warning: Insufficient data for train/val for '{target}'. Skipping."); continue

            # --- Hyperparameter Tuning with Optuna ---
            print(f"\nStarting Optuna tuning for '{target}' ({N_TRIALS} trials)...")
            study = optuna.create_study(direction='minimize')
            try:
                objective_func = lambda trial: objective(trial, target_df_train, target_df_val, target, DATE_COLUMN) # Pass constructed date col name
                study.optimize(objective_func, n_trials=N_TRIALS)
                best_params = study.best_params; print(f"Optuna finished. Best MAE: {study.best_value:.4f}")
                final_params = {'yearly_seasonality': True, 'weekly_seasonality': False, 'daily_seasonality': False, **best_params}
            except Exception as e:
                print(f"Optuna study failed: {e}. Using defaults."); traceback.print_exc()
                final_params = {'changepoint_prior_scale': 0.05, 'seasonality_prior_scale': 10.0, 'seasonality_mode': 'additive',
                                'yearly_seasonality': True, 'weekly_seasonality': False, 'daily_seasonality': False}

            # --- Train FINAL Model ---
            model = train_prophet_model(target_df_train, target, DATE_COLUMN, final_params) # Pass constructed date col name

            if model is not None:
                # --- Predict on Validation Period (2024) ---
                print(f"\nPredicting on validation period (2024) for '{target}'...")
                future_df_val = target_df_val[[DATE_COLUMN]].rename(columns={DATE_COLUMN: 'ds'}) # Use constructed date
                try:
                    validation_forecast = model.predict(future_df_val); print("Prediction complete.")

                    # --- Evaluate FINAL Model ---
                    print(f"\n--- Evaluating FINAL Model on 2024 Data for {target} ---")
                    validation_results = pd.merge(target_df_val[[DATE_COLUMN, target]], validation_forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']],
                                                  left_on=DATE_COLUMN, right_on='ds', how='inner') # Use constructed date

                    if not validation_results.empty:
                        actuals_val=validation_results[target]; preds_val=validation_results['yhat']
                        r2_val, mae_val, mse_val = calculate_metrics(actuals_val, preds_val)
                        print(f"FINAL Validation R2: {r2_val:.4f}, MAE: {mae_val:.2f}, MSE: {mse_val:.2f}")

                        # --- Plot FINAL Validation Results ---
                        print(f"\n--- Plotting FINAL Validation Results for {target} (Actual vs. Predicted 2024) ---")
                        plot_title_val = f'TUNED Validation (Nashik/Wheat): {target.replace("avg_", "").replace("_price", "").capitalize()} Price (Actual vs. Predicted 2024)'
                        fig_val = plot_validation_results(target_df_val, validation_results, target, DATE_COLUMN, plot_title_val) # Pass constructed date col name
                        fig_val.show()
                    else: print("Warning: Could not align validation actuals/predictions.")
                except Exception as e: print(f"Error during FINAL prediction/evaluation: {e}"); traceback.print_exc()
            else: print(f"Skipping validation for {target} (final model failed).")
        print("-" * 60); print("\nProcess finished.")
    else: print("\nCannot proceed: lack of data after filtering.")
else: print("\nFailed: check data loading, preprocessing, or mapping lookup.")

[I 2025-04-08 20:28:21,450] A new study created in memory with name: no-name-2d14991a-0a1c-4bbe-a69e-08379a4963af


--- Crop Price Time Series Forecasting & Validation with Optuna Tuning ---
--- (Nashik: 2002-2023 Train, 2024 Validate - Using Pre-Encoded Filters & Constructed Date) ---
------------------------------
Processing Training (2002-2023) Dataset
------------------------------
Loading Training (2002-2023) data from E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_2002_2023.csv...
Successfully loaded 6246 rows.
Constructing 'full_date' from 'year', 'month', 'date' columns...
6246 rows after date construction and validation.
6246 rows after ensuring target columns numeric.
Removed 10 rows identified as outliers based on IQR.
Training (2002-2023) data preprocessing complete. 6236 rows remaining.
------------------------------
Processing Validation (2024) Dataset
------------------------------
Loading Validation (2024) data from E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_test_2024.csv...
Successfully loaded 6246 rows.
Constructin

20:28:21 - cmdstanpy - INFO - Chain [1] start processing
20:28:23 - cmdstanpy - INFO - Chain [1] done processing
[I 2025-04-08 20:28:24,465] Trial 0 finished with value: 85.97902834548958 and parameters: {'changepoint_prior_scale': 0.05284640798090942, 'seasonality_prior_scale': 0.013255978873866587, 'seasonality_mode': 'additive'}. Best is trial 0 with value: 85.97902834548958.
20:28:24 - cmdstanpy - INFO - Chain [1] start processing
20:28:26 - cmdstanpy - INFO - Chain [1] done processing
[I 2025-04-08 20:28:27,293] Trial 1 finished with value: 79.06678693893342 and parameters: {'changepoint_prior_scale': 0.14947803806795448, 'seasonality_prior_scale': 0.48230311735564213, 'seasonality_mode': 'additive'}. Best is trial 1 with value: 79.06678693893342.
20:28:27 - cmdstanpy - INFO - Chain [1] start processing
20:28:28 - cmdstanpy - INFO - Chain [1] done processing
[I 2025-04-08 20:28:29,013] Trial 2 finished with value: 108.15143171159505 and parameters: {'changepoint_prior_scale': 0.00

Optuna finished. Best MAE: 77.0438

Training FINAL Prophet model for 'avg_min_price' using best parameters...
Best Params: {'yearly_seasonality': True, 'weekly_seasonality': False, 'daily_seasonality': False, 'changepoint_prior_scale': 0.47482865884783115, 'seasonality_prior_scale': 0.07803845592061813, 'seasonality_mode': 'multiplicative'}


20:29:45 - cmdstanpy - INFO - Chain [1] start processing
20:29:47 - cmdstanpy - INFO - Chain [1] done processing


Final model training complete.

Predicting on validation period (2024) for 'avg_min_price'...
Prediction complete.

--- Evaluating FINAL Model on 2024 Data for avg_min_price ---
FINAL Validation R2: 0.9253, MAE: 77.04, MSE: 13808.23

--- Plotting FINAL Validation Results for avg_min_price (Actual vs. Predicted 2024) ---


[I 2025-04-08 20:29:48,433] A new study created in memory with name: no-name-167dfaf2-5507-4bbc-b1f3-c6abe2514e72


Processing Target: avg_max_price

Starting Optuna tuning for 'avg_max_price' (25 trials)...


20:29:49 - cmdstanpy - INFO - Chain [1] start processing
20:29:50 - cmdstanpy - INFO - Chain [1] done processing
[I 2025-04-08 20:29:51,432] Trial 0 finished with value: 145.07783244625134 and parameters: {'changepoint_prior_scale': 0.010386090970215802, 'seasonality_prior_scale': 0.06739703645806368, 'seasonality_mode': 'multiplicative'}. Best is trial 0 with value: 145.07783244625134.
20:29:51 - cmdstanpy - INFO - Chain [1] start processing
20:29:52 - cmdstanpy - INFO - Chain [1] done processing
[I 2025-04-08 20:29:53,316] Trial 1 finished with value: 146.74641449507308 and parameters: {'changepoint_prior_scale': 0.008308341168756967, 'seasonality_prior_scale': 0.7648606624247244, 'seasonality_mode': 'additive'}. Best is trial 0 with value: 145.07783244625134.
20:29:53 - cmdstanpy - INFO - Chain [1] start processing
20:29:54 - cmdstanpy - INFO - Chain [1] done processing
[I 2025-04-08 20:29:54,913] Trial 2 finished with value: 153.72990948703503 and parameters: {'changepoint_prior_sc

Optuna finished. Best MAE: 100.4942

Training FINAL Prophet model for 'avg_max_price' using best parameters...
Best Params: {'yearly_seasonality': True, 'weekly_seasonality': False, 'daily_seasonality': False, 'changepoint_prior_scale': 0.4972630652962813, 'seasonality_prior_scale': 0.020950333285489, 'seasonality_mode': 'multiplicative'}


20:30:59 - cmdstanpy - INFO - Chain [1] start processing
20:31:02 - cmdstanpy - INFO - Chain [1] done processing


Final model training complete.

Predicting on validation period (2024) for 'avg_max_price'...
Prediction complete.

--- Evaluating FINAL Model on 2024 Data for avg_max_price ---
FINAL Validation R2: 0.9130, MAE: 100.49, MSE: 22079.85

--- Plotting FINAL Validation Results for avg_max_price (Actual vs. Predicted 2024) ---


[I 2025-04-08 20:31:02,767] A new study created in memory with name: no-name-a60b2e5d-fc40-4181-b9dc-a83bc1a6ba3f


Processing Target: avg_modal_price

Starting Optuna tuning for 'avg_modal_price' (25 trials)...


20:31:03 - cmdstanpy - INFO - Chain [1] start processing
20:31:05 - cmdstanpy - INFO - Chain [1] done processing
[I 2025-04-08 20:31:05,870] Trial 0 finished with value: 83.57348312340885 and parameters: {'changepoint_prior_scale': 0.3012009149899424, 'seasonality_prior_scale': 0.08379263354035059, 'seasonality_mode': 'multiplicative'}. Best is trial 0 with value: 83.57348312340885.
20:31:06 - cmdstanpy - INFO - Chain [1] start processing
20:31:07 - cmdstanpy - INFO - Chain [1] done processing
[I 2025-04-08 20:31:07,887] Trial 1 finished with value: 116.41619347892549 and parameters: {'changepoint_prior_scale': 0.019937179158464418, 'seasonality_prior_scale': 0.22486796284355484, 'seasonality_mode': 'additive'}. Best is trial 0 with value: 83.57348312340885.
20:31:08 - cmdstanpy - INFO - Chain [1] start processing
20:31:08 - cmdstanpy - INFO - Chain [1] done processing
[I 2025-04-08 20:31:09,416] Trial 2 finished with value: 131.0277104958872 and parameters: {'changepoint_prior_scale':

Optuna finished. Best MAE: 83.3868

Training FINAL Prophet model for 'avg_modal_price' using best parameters...
Best Params: {'yearly_seasonality': True, 'weekly_seasonality': False, 'daily_seasonality': False, 'changepoint_prior_scale': 0.4902266644284863, 'seasonality_prior_scale': 6.885695548575684, 'seasonality_mode': 'additive'}


20:32:17 - cmdstanpy - INFO - Chain [1] start processing
20:32:19 - cmdstanpy - INFO - Chain [1] done processing


Final model training complete.

Predicting on validation period (2024) for 'avg_modal_price'...
Prediction complete.

--- Evaluating FINAL Model on 2024 Data for avg_modal_price ---
FINAL Validation R2: 0.9240, MAE: 83.39, MSE: 16359.21

--- Plotting FINAL Validation Results for avg_modal_price (Actual vs. Predicted 2024) ---


------------------------------------------------------------

Process finished.


### final validation on 2024 nashik data

In [5]:
# !pip install optun

import pandas as pd
from prophet import Prophet
import plotly.graph_objects as go
import datetime
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from IPython.display import display
import numpy as np
import optuna
import traceback

# --- Configuration ---
# Training Data (Historical: 2002-2023, Nashik - PRE-ENCODED)
DATA_PATH_TRAIN = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_2002_2023.csv"
# Validation Data (Test: 2024, Nashik - PRE-ENCODED)
DATA_PATH_VALIDATION = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_test_2024.csv"

TARGET_COLUMNS = ['avg_min_price', 'avg_max_price', 'avg_modal_price']
# DATE_COLUMN will be *created* from year, month, day_col
DATE_COLUMN = 'full_date' # Define the name for the combined date column
YEAR_COL = 'year'         # Column name for year
MONTH_COL = 'month'       # Column name for month
DAY_COL = 'date'          # Column name for day (as specified by user)

MIN_DATA_POINTS_TRAIN = 30

# --- User Selections (Strings) ---
SELECTED_STATE_STR = "Maharashtra"
SELECTED_DISTRICT_STR = "Nashik"
SELECTED_COMMODITY_STR = "Wheat"

# --- Frequency Encoding Mappings (CRITICAL: REPLACE WITH YOUR ACTUAL MAPPINGS) ---
state_name_encoding_map = {"maharashtra": 6291} # Replace with actual
district_name_encoding_map = {"nashik": 6291}      # Replace with actual
commodity_name_encoding_map = {"wheat": 6291}       # Replace with actual

# --- Optuna Configuration ---
N_TRIALS = 25

# --- Outlier Removal Function (No changes) ---
def remove_outliers_iqr(df, columns_to_check):
    df_filtered = df.copy()
    initial_rows = len(df_filtered)
    # print(f"Applying IQR Outlier Removal on columns: {columns_to_check}")
    valid_columns = [col for col in columns_to_check if col in df_filtered.columns and pd.api.types.is_numeric_dtype(df_filtered[col])]
    if not valid_columns: return df_filtered
    subset_for_iqr = df_filtered[valid_columns]
    Q1 = subset_for_iqr.quantile(0.25); Q3 = subset_for_iqr.quantile(0.75); IQR = Q3 - Q1
    mask = ~((subset_for_iqr < (Q1 - 1.5 * IQR)) | (subset_for_iqr > (Q3 + 1.5 * IQR))).any(axis=1)
    df_filtered = df_filtered[mask]
    rows_removed = initial_rows - len(df_filtered)
    if rows_removed > 0: print(f"Removed {rows_removed} rows identified as outliers based on IQR.")
    return df_filtered

# --- Data Loading and Preprocessing Function (MODIFIED DATE HANDLING) ---
def load_and_preprocess_data(path, date_col_name, year_col, month_col, day_col, target_cols, dataset_name="Training"):
    """Loads data and constructs the date from year, month, day columns."""
    print("-" * 30); print(f"Processing {dataset_name} Dataset"); print("-" * 30)
    try:
        print(f"Loading {dataset_name} data from {path}...")
        df = pd.read_csv(path)
        print(f"Successfully loaded {len(df)} rows.")

        # --- 1. Construct Date Column from year, month, day ---
        date_components_cols = [year_col, month_col, day_col]
        if not all(col in df.columns for col in date_components_cols):
            missing_dt_cols = [col for col in date_components_cols if col not in df.columns]
            print(f"Error: Date component columns missing: {missing_dt_cols}")
            return None

        # Ensure components are numeric before assembly
        for col in date_components_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        df.dropna(subset=date_components_cols, inplace=True) # Drop rows if year/month/day invalid

        print(f"Constructing '{date_col_name}' from '{year_col}', '{month_col}', '{day_col}' columns...")
        # Assemble the date - IMPORTANT CORRECTION based on user input
        df[date_col_name] = pd.to_datetime({
            'year': df[year_col],
            'month': df[month_col],
            'day': df[day_col]  # Use the column user specified for day
        }, errors='coerce')

        # Drop rows where constructed date is invalid (NaT)
        initial_rows_date = len(df)
        df.dropna(subset=[date_col_name], inplace=True)
        if initial_rows_date > len(df):
             print(f"Dropped {initial_rows_date - len(df)} rows due to invalid date component combinations.")
        print(f"{len(df)} rows after date construction and validation.")
        # --- End Date Construction ---

        # 2. Initial dropna()
        initial_rows = len(df); df.dropna(inplace=True)
        if initial_rows > len(df): print(f"{len(df)} rows after initial broad dropna(). {initial_rows - len(df)} removed.")

        # 3. Ensure Price columns are numeric
        for col in target_cols:
            if col in df.columns: df[col] = pd.to_numeric(df[col], errors='coerce')
            else: print(f"Warning: Target column '{col}' not found.")
        df.dropna(subset=target_cols, inplace=True, how='any')
        print(f"{len(df)} rows after ensuring target columns numeric.")

        # 4. Drop OTHER unused columns
        cols_to_drop = ['calculationType', 'district_id', 'change',
                        'district_name_enc', 'commodity_name_enc', 'state_name_enc']
        existing_cols_to_drop = [col for col in cols_to_drop if col in df.columns]
        if existing_cols_to_drop:
            df.drop(columns=existing_cols_to_drop, axis=1, inplace=True)
            # print(f"Dropped potentially existing unused columns: {existing_cols_to_drop}") # Less verbose

        # 5. Apply IQR Outlier Removal
        df = remove_outliers_iqr(df, target_cols)

        # Final check for required columns
        required_numeric_filter_cols = ['state_name', 'district_name', 'commodity_name']
        required_cols = [date_col_name] + target_cols + required_numeric_filter_cols # Use constructed date_col_name
        missing_req_cols = [col for col in required_cols if col not in df.columns]
        if missing_req_cols:
            print(f"Error: Required columns missing: {missing_req_cols}"); print(f"Available: {df.columns.tolist()}"); return None
        for col in required_numeric_filter_cols:
             if not pd.api.types.is_numeric_dtype(df[col]): print(f"Error: Col '{col}' expected numeric but isn't."); return None

        df.sort_values(date_col_name, inplace=True) # Sort by constructed date
        print(f"{dataset_name} data preprocessing complete. {len(df)} rows remaining.")
        return df

    except FileNotFoundError: print(f"Error: {dataset_name} file not found at {path}"); return None
    except Exception as e: print(f"Error loading/preprocessing {dataset_name}: {e}"); traceback.print_exc(); return None

# --- Optuna Objective Function (Use DATE_COLUMN name) ---
def objective(trial, train_df, val_df, target_col, date_col_name): # Use date_col_name
    changepoint_prior_scale = trial.suggest_float('changepoint_prior_scale', 0.0001, 0.5, log=True)
    seasonality_prior_scale = trial.suggest_float('seasonality_prior_scale', 0.01, 10.0, log=True)
    seasonality_mode = trial.suggest_categorical('seasonality_mode', ['additive', 'multiplicative'])
    params = {
        'changepoint_prior_scale': changepoint_prior_scale, 'seasonality_prior_scale': seasonality_prior_scale,
        'seasonality_mode': seasonality_mode, 'yearly_seasonality': True, 'weekly_seasonality': False, 'daily_seasonality': False
    }
    try:
        prophet_df_train = train_df[[date_col_name, target_col]].rename(columns={date_col_name: 'ds', target_col: 'y'})
        if len(prophet_df_train) < MIN_DATA_POINTS_TRAIN: return float('inf')
        model = Prophet(**params); model.fit(prophet_df_train)
        future_df_val = val_df[[date_col_name]].rename(columns={date_col_name: 'ds'}) # Use date_col_name
        forecast = model.predict(future_df_val)
        results = pd.merge(val_df[[date_col_name, target_col]], forecast[['ds', 'yhat']],
                           left_on=date_col_name, right_on='ds', how='inner') # Use date_col_name
        if results.empty: return float('inf')
        mae = mean_absolute_error(results[target_col], results['yhat'])
        return mae
    except Exception as e: return float('inf')

# --- Training Function (Use DATE_COLUMN name) ---
def train_prophet_model(training_data, target_column, date_col_name, params): # Use date_col_name
    print(f"\nTraining FINAL Prophet model for '{target_column}' using best parameters..."); print(f"Best Params: {params}")
    prophet_df = training_data[[date_col_name, target_column]].rename(columns={date_col_name: 'ds', target_column: 'y'})
    if len(prophet_df) < MIN_DATA_POINTS_TRAIN: print(f"Warning: Insufficient data ({len(prophet_df)} points). Skipping."); return None
    try:
        model = Prophet(**params); model.fit(prophet_df)
        print("Final model training complete."); return model
    except Exception as e: print(f"Error during FINAL training for {target_column}: {e}"); return None

# --- Evaluation Metrics Function (No changes) ---
def calculate_metrics(y_true, y_pred):
    y_true = np.array(y_true); y_pred = np.array(y_pred)
    if len(y_true) == 0 or len(y_pred) == 0: return np.nan, np.nan, np.nan
    if len(y_true) != len(y_pred):
        min_len = min(len(y_true), len(y_pred)); print(f"Warn: Mismatch metrics. Truncating to {min_len}.");
        if min_len == 0 : return np.nan, np.nan, np.nan
        y_true = y_true[:min_len]; y_pred = y_pred[:min_len]
    try:
        r2 = r2_score(y_true, y_pred); mae = mean_absolute_error(y_true, y_pred); mse = mean_squared_error(y_true, y_pred)
        return r2, mae, mse
    except Exception as e: print(f"Error calculating metrics: {e}"); return np.nan, np.nan, np.nan

# --- Plotting Function (Use DATE_COLUMN name) ---
def plot_validation_results(validation_actuals, validation_predictions, target_column, date_col_name, title): # Use date_col_name
    fig = go.Figure(); target_label = target_column.replace("avg_", "").replace("_price", "").capitalize()
    actual_data_plot = validation_actuals[[date_col_name, target_column]].dropna()
    fig.add_trace(go.Scatter(x=actual_data_plot[date_col_name], y=actual_data_plot[target_column], mode='lines+markers', name=f'Actual {target_label} (2024)', line=dict(color='blue'), marker=dict(size=4)))
    fig.add_trace(go.Scatter(x=validation_predictions['ds'], y=validation_predictions['yhat'], mode='lines', name=f'Predicted {target_label} (2024)', line=dict(color='red')))
    fig.add_trace(go.Scatter(x=validation_predictions['ds'], y=validation_predictions['yhat_upper'], mode='lines', name='Upper Bound', line=dict(width=0), showlegend=False))
    fig.add_trace(go.Scatter(x=validation_predictions['ds'], y=validation_predictions['yhat_lower'], mode='lines', name='Lower Bound', line=dict(width=0), fillcolor='rgba(255, 0, 0, 0.2)', fill='tonexty', showlegend=False))
    fig.update_layout(title=title, xaxis_title='Date (2024)', yaxis_title=f'Price ({target_label})', hovermode="x unified", legend_title_text='Legend')
    return fig


# --- Main Execution Block ---
print("--- Crop Price Time Series Forecasting & Validation with Optuna Tuning ---")
print(f"--- (Nashik: 2002-2023 Train, 2024 Validate - Using Pre-Encoded Filters & Constructed Date) ---") # Updated title

# Load Training data (pass date component column names)
df_train_full = load_and_preprocess_data(DATA_PATH_TRAIN, DATE_COLUMN, YEAR_COL, MONTH_COL, DAY_COL, TARGET_COLUMNS, "Training (2002-2023)")

# Load Validation data (pass date component column names)
df_val_full = load_and_preprocess_data(DATA_PATH_VALIDATION, DATE_COLUMN, YEAR_COL, MONTH_COL, DAY_COL, TARGET_COLUMNS, "Validation (2024)")


# Proceed only if both datasets loaded successfully
if df_train_full is not None and df_val_full is not None:

    # --- Get Encoded Values for Filtering ---
    try:
        selected_state_key=SELECTED_STATE_STR.strip().lower(); selected_district_key=SELECTED_DISTRICT_STR.strip().lower(); selected_commodity_key=SELECTED_COMMODITY_STR.strip().lower()
        encoded_state = state_name_encoding_map.get(selected_state_key); encoded_district = district_name_encoding_map.get(selected_district_key); encoded_commodity = commodity_name_encoding_map.get(selected_commodity_key)
        lookup_failed = False
        if encoded_state is None: print(f"Error: State '{SELECTED_STATE_STR}' missing in map."); lookup_failed=True
        if encoded_district is None: print(f"Error: District '{SELECTED_DISTRICT_STR}' missing in map."); lookup_failed=True
        if encoded_commodity is None: print(f"Error: Commodity '{SELECTED_COMMODITY_STR}' missing in map."); lookup_failed=True
        if lookup_failed: print("Check encoding maps."); df_train_full=df_val_full=None
        else: print(f"\nSelected: State='{SELECTED_STATE_STR}', District='{SELECTED_DISTRICT_STR}', Commodity='{SELECTED_COMMODITY_STR}' -> Encoded: St={encoded_state}, Di={encoded_district}, Co={encoded_commodity}")
    except Exception as e: print(f"Error mapping lookup: {e}"); df_train_full = df_val_full = None


# Proceed only if lookup succeeded
if df_train_full is not None and df_val_full is not None:

    # --- Filtering Data Based on ENCODED Values ---
    print(f"\nFiltering datasets using encoded values...")
    filter_cols_num = ['state_name', 'district_name', 'commodity_name']

    # Filter Training Data
    if not all(col in df_train_full.columns for col in filter_cols_num): print("Error: Encoded filter cols missing in Training data."); filtered_df_train = pd.DataFrame()
    else:
        filtered_df_train = df_train_full[(df_train_full['state_name'] == encoded_state) & (df_train_full['district_name'] == encoded_district) & (df_train_full['commodity_name'] == encoded_commodity)].copy()
        filtered_df_train.sort_values(by=DATE_COLUMN, inplace=True) # Sort by constructed date

    # Filter Validation Data
    if not all(col in df_val_full.columns for col in filter_cols_num): print("Error: Encoded filter cols missing in Validation data."); filtered_df_val = pd.DataFrame()
    else:
        filtered_df_val = df_val_full[(df_val_full['state_name'] == encoded_state) & (df_val_full['district_name'] == encoded_district) & (df_val_full['commodity_name'] == encoded_commodity)].copy()
        filtered_df_val.sort_values(by=DATE_COLUMN, inplace=True) # Sort by constructed date

    if filtered_df_train.empty: print("\nWarning: No training data found after filtering.")
    if filtered_df_val.empty: print("\nWarning: No validation data found after filtering.")

    # Proceed only if BOTH filtered datasets have data
    if not filtered_df_train.empty and not filtered_df_val.empty:
        print(f"\nFound {len(filtered_df_train)} training points and {len(filtered_df_val)} validation points after filtering.")

        # --- Loop through each target price type ---
        for target in TARGET_COLUMNS:
            print("=" * 60); print(f"Processing Target: {target}"); print("=" * 60)
            target_df_train = filtered_df_train[[DATE_COLUMN, target]].dropna().copy() # Use constructed date
            target_df_val = filtered_df_val[[DATE_COLUMN, target]].dropna().copy()   # Use constructed date

            if target_df_train.empty or target_df_val.empty: print(f"Warning: Insufficient data for train/val for '{target}'. Skipping."); continue

            # --- Hyperparameter Tuning with Optuna ---
            print(f"\nStarting Optuna tuning for '{target}' ({N_TRIALS} trials)...")
            study = optuna.create_study(direction='minimize')
            try:
                objective_func = lambda trial: objective(trial, target_df_train, target_df_val, target, DATE_COLUMN) # Pass constructed date col name
                study.optimize(objective_func, n_trials=N_TRIALS)
                best_params = study.best_params; print(f"Optuna finished. Best MAE: {study.best_value:.4f}")
                final_params = {'yearly_seasonality': True, 'weekly_seasonality': False, 'daily_seasonality': False, **best_params}
            except Exception as e:
                print(f"Optuna study failed: {e}. Using defaults."); traceback.print_exc()
                final_params = {'changepoint_prior_scale': 0.05, 'seasonality_prior_scale': 10.0, 'seasonality_mode': 'additive',
                                'yearly_seasonality': True, 'weekly_seasonality': False, 'daily_seasonality': False}

            # --- Train FINAL Model ---
            model = train_prophet_model(target_df_train, target, DATE_COLUMN, final_params) # Pass constructed date col name

            if model is not None:
                # --- Predict on Validation Period (2024) ---
                print(f"\nPredicting on validation period (2024) for '{target}'...")
                future_df_val = target_df_val[[DATE_COLUMN]].rename(columns={DATE_COLUMN: 'ds'}) # Use constructed date
                try:
                    validation_forecast = model.predict(future_df_val); print("Prediction complete.")

                    # --- Evaluate FINAL Model ---
                    print(f"\n--- Evaluating FINAL Model on 2024 Data for {target} ---")
                    validation_results = pd.merge(target_df_val[[DATE_COLUMN, target]], validation_forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']],
                                                  left_on=DATE_COLUMN, right_on='ds', how='inner') # Use constructed date

                    if not validation_results.empty:
                        actuals_val=validation_results[target]; preds_val=validation_results['yhat']
                        r2_val, mae_val, mse_val = calculate_metrics(actuals_val, preds_val)
                        print(f"FINAL Validation R2: {r2_val:.4f}, MAE: {mae_val:.2f}, MSE: {mse_val:.2f}")

                        # --- Plot FINAL Validation Results ---
                        print(f"\n--- Plotting FINAL Validation Results for {target} (Actual vs. Predicted 2024) ---")
                        plot_title_val = f'TUNED Validation (Nashik/Wheat): {target.replace("avg_", "").replace("_price", "").capitalize()} Price (Actual vs. Predicted 2024)'
                        fig_val = plot_validation_results(target_df_val, validation_results, target, DATE_COLUMN, plot_title_val) # Pass constructed date col name
                        fig_val.show()
                    else: print("Warning: Could not align validation actuals/predictions.")
                except Exception as e: print(f"Error during FINAL prediction/evaluation: {e}"); traceback.print_exc()
            else: print(f"Skipping validation for {target} (final model failed).")
        print("-" * 60); print("\nProcess finished.")
    else: print("\nCannot proceed: lack of data after filtering.")
else: print("\nFailed: check data loading, preprocessing, or mapping lookup.")

[I 2025-04-09 20:30:12,656] A new study created in memory with name: no-name-ef8eb82a-6c83-4065-b976-4bf1a5203f4d


--- Crop Price Time Series Forecasting & Validation with Optuna Tuning ---
--- (Nashik: 2002-2023 Train, 2024 Validate - Using Pre-Encoded Filters & Constructed Date) ---
------------------------------
Processing Training (2002-2023) Dataset
------------------------------
Loading Training (2002-2023) data from E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_2002_2023.csv...
Successfully loaded 6246 rows.
Constructing 'full_date' from 'year', 'month', 'date' columns...
6246 rows after date construction and validation.
6246 rows after ensuring target columns numeric.
Removed 10 rows identified as outliers based on IQR.
Training (2002-2023) data preprocessing complete. 6236 rows remaining.
------------------------------
Processing Validation (2024) Dataset
------------------------------
Loading Validation (2024) data from E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_nashik_test_2024.csv...
Successfully loaded 278 rows.
Constructing

20:30:12 - cmdstanpy - INFO - Chain [1] start processing
20:30:13 - cmdstanpy - INFO - Chain [1] done processing
[I 2025-04-09 20:30:13,712] Trial 0 finished with value: 294.300868046038 and parameters: {'changepoint_prior_scale': 0.005553045000321441, 'seasonality_prior_scale': 0.030191857961311533, 'seasonality_mode': 'multiplicative'}. Best is trial 0 with value: 294.300868046038.
20:30:14 - cmdstanpy - INFO - Chain [1] start processing
20:30:14 - cmdstanpy - INFO - Chain [1] done processing
[I 2025-04-09 20:30:14,834] Trial 1 finished with value: 336.2547585924113 and parameters: {'changepoint_prior_scale': 0.0006381145210987415, 'seasonality_prior_scale': 0.018669494913948082, 'seasonality_mode': 'additive'}. Best is trial 0 with value: 294.300868046038.
20:30:15 - cmdstanpy - INFO - Chain [1] start processing
20:30:16 - cmdstanpy - INFO - Chain [1] done processing
[I 2025-04-09 20:30:16,958] Trial 2 finished with value: 210.76885267926676 and parameters: {'changepoint_prior_scale

Optuna finished. Best MAE: 188.6778

Training FINAL Prophet model for 'avg_min_price' using best parameters...
Best Params: {'yearly_seasonality': True, 'weekly_seasonality': False, 'daily_seasonality': False, 'changepoint_prior_scale': 0.49538705419906787, 'seasonality_prior_scale': 0.38316089798671166, 'seasonality_mode': 'multiplicative'}


20:31:06 - cmdstanpy - INFO - Chain [1] start processing
20:31:08 - cmdstanpy - INFO - Chain [1] done processing


Final model training complete.

Predicting on validation period (2024) for 'avg_min_price'...
Prediction complete.

--- Evaluating FINAL Model on 2024 Data for avg_min_price ---
FINAL Validation R2: -0.4491, MAE: 188.68, MSE: 47641.92

--- Plotting FINAL Validation Results for avg_min_price (Actual vs. Predicted 2024) ---


[I 2025-04-09 20:31:09,045] A new study created in memory with name: no-name-c19a3035-89ce-466a-8009-e21c819c4fe2


Processing Target: avg_max_price

Starting Optuna tuning for 'avg_max_price' (25 trials)...


20:31:09 - cmdstanpy - INFO - Chain [1] start processing
20:31:10 - cmdstanpy - INFO - Chain [1] done processing
[I 2025-04-09 20:31:10,433] Trial 0 finished with value: 432.4033875293997 and parameters: {'changepoint_prior_scale': 0.00017967060519362215, 'seasonality_prior_scale': 0.0459764400448988, 'seasonality_mode': 'additive'}. Best is trial 0 with value: 432.4033875293997.
20:31:11 - cmdstanpy - INFO - Chain [1] start processing
20:31:11 - cmdstanpy - INFO - Chain [1] done processing
[I 2025-04-09 20:31:12,099] Trial 1 finished with value: 463.0340986419641 and parameters: {'changepoint_prior_scale': 0.0005698769688931341, 'seasonality_prior_scale': 5.698501790458859, 'seasonality_mode': 'additive'}. Best is trial 0 with value: 432.4033875293997.
20:31:12 - cmdstanpy - INFO - Chain [1] start processing
20:31:14 - cmdstanpy - INFO - Chain [1] done processing
[I 2025-04-09 20:31:14,319] Trial 2 finished with value: 248.06979686631504 and parameters: {'changepoint_prior_scale': 0.0

Optuna finished. Best MAE: 227.3038

Training FINAL Prophet model for 'avg_max_price' using best parameters...
Best Params: {'yearly_seasonality': True, 'weekly_seasonality': False, 'daily_seasonality': False, 'changepoint_prior_scale': 0.4393513442844508, 'seasonality_prior_scale': 1.3173768537014885, 'seasonality_mode': 'multiplicative'}


20:32:09 - cmdstanpy - INFO - Chain [1] start processing
20:32:11 - cmdstanpy - INFO - Chain [1] done processing


Final model training complete.

Predicting on validation period (2024) for 'avg_max_price'...
Prediction complete.

--- Evaluating FINAL Model on 2024 Data for avg_max_price ---
FINAL Validation R2: -1.5589, MAE: 227.30, MSE: 62135.60

--- Plotting FINAL Validation Results for avg_max_price (Actual vs. Predicted 2024) ---


[I 2025-04-09 20:32:12,172] A new study created in memory with name: no-name-15f6ee2c-d388-43d3-b855-814da882ecbe


Processing Target: avg_modal_price

Starting Optuna tuning for 'avg_modal_price' (25 trials)...


20:32:12 - cmdstanpy - INFO - Chain [1] start processing
20:32:13 - cmdstanpy - INFO - Chain [1] done processing
[I 2025-04-09 20:32:13,526] Trial 0 finished with value: 394.36702090662584 and parameters: {'changepoint_prior_scale': 0.0001318995481623456, 'seasonality_prior_scale': 0.02210845485361886, 'seasonality_mode': 'additive'}. Best is trial 0 with value: 394.36702090662584.
20:32:14 - cmdstanpy - INFO - Chain [1] start processing
20:32:14 - cmdstanpy - INFO - Chain [1] done processing
[I 2025-04-09 20:32:15,025] Trial 1 finished with value: 437.74735192025435 and parameters: {'changepoint_prior_scale': 0.0021936518232433797, 'seasonality_prior_scale': 8.245852759913268, 'seasonality_mode': 'multiplicative'}. Best is trial 0 with value: 394.36702090662584.
20:32:15 - cmdstanpy - INFO - Chain [1] start processing
20:32:16 - cmdstanpy - INFO - Chain [1] done processing
[I 2025-04-09 20:32:16,437] Trial 2 finished with value: 437.692367483256 and parameters: {'changepoint_prior_sca

Optuna finished. Best MAE: 251.8989

Training FINAL Prophet model for 'avg_modal_price' using best parameters...
Best Params: {'yearly_seasonality': True, 'weekly_seasonality': False, 'daily_seasonality': False, 'changepoint_prior_scale': 0.4812606458337515, 'seasonality_prior_scale': 0.5869780523701109, 'seasonality_mode': 'multiplicative'}


20:33:13 - cmdstanpy - INFO - Chain [1] start processing
20:33:16 - cmdstanpy - INFO - Chain [1] done processing


Final model training complete.

Predicting on validation period (2024) for 'avg_modal_price'...
Prediction complete.

--- Evaluating FINAL Model on 2024 Data for avg_modal_price ---
FINAL Validation R2: -2.1237, MAE: 251.90, MSE: 72286.49

--- Plotting FINAL Validation Results for avg_modal_price (Actual vs. Predicted 2024) ---


------------------------------------------------------------

Process finished.
