In [3]:
import pandas as pd
from prophet import Prophet
import plotly.graph_objects as go
import datetime
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from IPython.display import display # Optional: for better dataframe rendering in notebooks

# --- Configuration ---
DATA_PATH = r"E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_21_24.csv"  # Make sure this file is in the same directory or provide the full path
TARGET_COLUMNS = ['avg_min_price', 'avg_max_price', 'avg_modal_price']
DATE_COLUMN = 'date'
MIN_DATA_POINTS = 30 # Minimum data points required to train a model

# --- User Selections (Replace Streamlit Sidebar Inputs) ---
# Set these values manually for your desired forecast
SELECTED_STATE = "Maharashtra"  # Example: Choose a state from your data
SELECTED_DISTRICT = "Akola"     # Example: Choose a district
SELECTED_COMMODITY = "Wheat"    # Example: Choose a commodity
FORECAST_DAYS = 90             # Example: Forecast period

# --- Data Loading Function ---
def load_data(path):
    """Loads and preprocesses the data."""
    try:
        df = pd.read_csv(path)
        print(f"Successfully loaded data from {path}")

        # Basic Preprocessing
        df[DATE_COLUMN] = pd.to_datetime(df[DATE_COLUMN], errors='coerce')
        initial_rows = len(df)
        df.dropna(subset=[DATE_COLUMN], inplace=True) # Drop rows where date conversion failed
        if initial_rows > len(df):
             print(f"Dropped {initial_rows - len(df)} rows due to invalid dates.")

        # Ensure price columns are numeric, coerce errors to NaN
        for col in TARGET_COLUMNS:
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # Optional: Impute missing prices if needed (example using ffill per group)
        # print("Attempting forward fill for missing prices within groups...")
        # df.sort_values([DATE_COLUMN,'state_name', 'district_name', 'commodity_name'], inplace=True)
        # df[TARGET_COLUMNS] = df.groupby(['state_name', 'district_name', 'commodity_name'])[TARGET_COLUMNS].ffill()

        initial_rows = len(df)
        df.dropna(subset=TARGET_COLUMNS, inplace=True) # Drop rows with missing target values
        if initial_rows > len(df):
             print(f"Dropped {initial_rows - len(df)} rows due to missing price data after preprocessing.")

        df.sort_values(DATE_COLUMN, inplace=True)
        print(f"Data preprocessing complete. {len(df)} rows remaining.")
        return df
    except FileNotFoundError:
        print(f"Error: Data file not found at {path}")
        return None
    except Exception as e:
        print(f"Error loading or preprocessing data: {e}")
        return None

# --- Modeling Function ---
def train_and_forecast(data, target_column, forecast_periods):
    """Trains a Prophet model and returns the model, forecast (starting from today),
       and predictions on the historical data for evaluation."""
    # Prepare data for Prophet (requires 'ds' and 'y' columns)
    prophet_df = data[[DATE_COLUMN, target_column]].rename(columns={DATE_COLUMN: 'ds', target_column: 'y'})

    # Check for sufficient data points for training
    if len(prophet_df) < MIN_DATA_POINTS:
        print(f"Warning: Not enough historical data points ({len(prophet_df)}) for '{target_column}' in the selected group to train. Need at least {MIN_DATA_POINTS}. Skipping forecast.")
        return None, None, None # Return None for model, forecast, and historical predictions

    try:
        print(f"\nTraining Prophet model for '{target_column}'...")
        # Instantiate and fit Prophet model on historical data
        model = Prophet(
            yearly_seasonality=True,
            weekly_seasonality=False, # Adjust based on expected patterns
            daily_seasonality=False   # Adjust based on expected patterns
        )
        model.fit(prophet_df) # Fit the model using historical data
        print("Model training complete.")

        # --- Create future dataframe STARTING FROM TODAY ---
        # Get today's date (midnight) based on system time when script runs
        current_date = pd.Timestamp.now().normalize()
        # Create a sequence of dates starting from today for the forecast period
        future_dates = pd.date_range(start=current_date, periods=forecast_periods, freq='D')
        future_df = pd.DataFrame({'ds': future_dates})
        # ----------------------------------------------------

        # Generate forecast using the dates starting from today
        print(f"Generating {forecast_periods}-day forecast starting from {current_date.strftime('%Y-%m-%d')}...")
        forecast = model.predict(future_df)
        print("Forecast generation complete.")

        # Generate predictions on historical data for evaluation
        historical_preds = model.predict(prophet_df) # Predict on the same data used for training

        return model, forecast, historical_preds # Return model, future forecast, historical predictions

    except Exception as e:
        print(f"Error during Prophet modeling or forecasting for {target_column}: {e}")
        return None, None, None

# --- Plotting Function for a Single Target ---
def plot_single_forecast(historical_data, forecast_data, target_column, title):
    """Creates a Plotly figure for one target's historical data and forecast."""
    fig = go.Figure()
    target_label = target_column.replace("avg_", "").replace("_price", "").capitalize() # Clean label for display

    # Add historical data trace
    hist_data_col = historical_data[[DATE_COLUMN, target_column]].dropna()
    fig.add_trace(go.Scatter(
        x=hist_data_col[DATE_COLUMN],
        y=hist_data_col[target_column],
        mode='lines',
        name=f'Historical {target_label}',
        line=dict(color='blue')
    ))

    # Add forecast trace (starts from today)
    fig.add_trace(go.Scatter(
        x=forecast_data['ds'],
        y=forecast_data['yhat'],
        mode='lines',
        name=f'Forecast {target_label}',
        line=dict(color='red', dash='dash')
    ))

    # Add uncertainty interval for the forecast
    fig.add_trace(go.Scatter(
        x=forecast_data['ds'],
        y=forecast_data['yhat_upper'],
        mode='lines', name='Forecast Upper Bound',
        line=dict(width=0),
        showlegend=False
    ))
    fig.add_trace(go.Scatter(
        x=forecast_data['ds'],
        y=forecast_data['yhat_lower'],
        mode='lines', name='Forecast Lower Bound',
        line=dict(width=0),
        fillcolor='rgba(255, 0, 0, 0.2)', # Light red fill for uncertainty
        fill='tonexty',
        showlegend=False
    ))

    fig.update_layout(
        title=title,
        xaxis_title='Date',
        yaxis_title=f'Price ({target_label})',
        hovermode="x unified",
        legend_title_text='Legend'
    )
    return fig

# --- Evaluation Metrics Function ---
def calculate_metrics(y_true, y_pred):
    """Calculates and returns R2, MAE, and MSE."""
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    return r2, mae, mse

# --- Main Execution Block ---
print("--- Crop Price Time Series Forecasting ---")
print(f"Forecasting from current date: {pd.Timestamp.now().normalize().strftime('%Y-%m-%d')}")

# Load data
df_full = load_data(DATA_PATH)

if df_full is not None:
    # --- Filtering Data Based on User Selections ---
    print(f"\nFiltering data for State='{SELECTED_STATE}', District='{SELECTED_DISTRICT}', Commodity='{SELECTED_COMMODITY}'...")
    filtered_df = df_full[
        (df_full['state_name'].str.strip().str.lower() == SELECTED_STATE.strip().lower()) &
        (df_full['district_name'].str.strip().str.lower() == SELECTED_DISTRICT.strip().lower()) &
        (df_full['commodity_name'].str.strip().str.lower() == SELECTED_COMMODITY.strip().lower())
    ].copy() # Use copy to avoid SettingWithCopyWarning

    # Ensure data is sorted by date (important for plotting historical correctly)
    filtered_df.sort_values(by=DATE_COLUMN, inplace=True)

    if filtered_df.empty:
        print("\nWarning: No historical data found for the selected combination.")
        print("Please check the CSV file and your selections (State, District, Commodity).")
    else:
        last_hist_date = filtered_df[DATE_COLUMN].max().strftime('%Y-%m-%d')
        print(f"\nFound {len(filtered_df)} historical data points (Latest: {last_hist_date}).")
        print(f"Proceeding with forecast for {FORECAST_DAYS} days.")

        all_forecasts = {} # Dictionary to store forecasts if needed later

        # Loop through each target price type
        for target in TARGET_COLUMNS:
            print("-" * 50)
            print(f"Processing Target: {target}")

            # Check if target column exists and has data after filtering
            if target not in filtered_df.columns or filtered_df[target].isnull().all():
                print(f"Warning: Target column '{target}' not found or contains only null values for the selection. Skipping.")
                continue

            # Prepare data for this specific target (dropping NaNs for this target)
            target_df = filtered_df[[DATE_COLUMN, target]].dropna().copy()
            if target_df.empty:
                 print(f"Warning: No valid data points for '{target}' after dropping NaNs. Skipping.")
                 continue

            # Train model, get forecast (future dates), and get historical predictions
            model, forecast, historical_preds = train_and_forecast(target_df, target, FORECAST_DAYS)

            if forecast is not None and historical_preds is not None:
                all_forecasts[target] = forecast # Store the forecast

                # --- Evaluate Model Fit on Historical Data ---
                print(f"\n--- Evaluating Model Fit for {target} (on historical data) ---")
                actuals = target_df[target] # Ground truth from the training data
                preds = historical_preds['yhat'] # Predictions on the training data
                
                # Ensure alignment - Prophet predictions match the input df length/order
                if len(actuals) == len(preds):
                    r2, mae, mse = calculate_metrics(actuals, preds)
                    print(f"R-squared (R2): {r2:.4f}")
                    print(f"Mean Absolute Error (MAE): {mae:.2f}")
                    print(f"Mean Squared Error (MSE): {mse:.2f}")
                else:
                    print("Warning: Mismatch between actuals and predictions length. Cannot calculate metrics accurately.")
                    print(f"Actuals length: {len(actuals)}, Predictions length: {len(preds)}")


                # --- Plot Historical Data and Forecast ---
                print(f"\n--- Plotting Historical Data & Forecast for {target} ---")
                plot_title = f'{target.replace("avg_", "").replace("_price", "").capitalize()} Price: Historical & {FORECAST_DAYS}-Day Forecast\n({SELECTED_COMMODITY} in {SELECTED_DISTRICT}, {SELECTED_STATE})'
                fig = plot_single_forecast(target_df, forecast, target, plot_title)
                fig.show() # Display the plot in the notebook output

                # --- Display Forecast Data Table ---
                print(f"\n--- Forecast Data Table for {target} ({FORECAST_DAYS} days) ---")
                f_display = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].copy()
                f_display.columns = ['Date', 'Forecast', 'Lower Bound', 'Upper Bound']
                f_display['Date'] = f_display['Date'].dt.strftime('%Y-%m-%d') # Format date
                # Use display for potentially nicer formatting in Jupyter, or just print
                display(f_display.set_index('Date').style.format("{:.2f}"))
                # Alternatively: print(f_display.set_index('Date').round(2).to_string())

            else:
                # Message already printed in train_and_forecast if skipped
                print(f"Skipping results display for {target} due to insufficient data or error during modeling.")

        print("-" * 50)
        print("\nForecasting process finished.")

else:
    print("\nFailed to load data. Cannot run the forecasting process.")

--- Crop Price Time Series Forecasting ---
Forecasting from current date: 2025-04-06
Successfully loaded data from E:\elevatetrsest\crop price predictor\Crop_price_Prediction\data\edited_21_24.csv
Data preprocessing complete. 27676 rows remaining.

Filtering data for State='Maharashtra', District='Akola', Commodity='Wheat'...


AttributeError: Can only use .str accessor with string values!